# Imports

In [None]:
#!pip install --upgrade data_repo_client
#!pip install --upgrade xmltodict

In [None]:
# Imports
import requests
import json
import google.auth
import xmltodict
import data_repo_client
import pandas as pd
import re
from time import sleep
import ast

# Function to refresh TDR API client
def refresh_tdr_api_client():
    """Build a TDR API client authenticated with a freshly refreshed token.

    Obtains Google default credentials, refreshes them to get a current
    access token, and returns a ``data_repo_client.ApiClient`` pointed at
    https://data.terra.bio with client-side validation switched off.

    Returns:
        data_repo_client.ApiClient: client configured with the refreshed token.
    """
    # ``import google.auth`` alone does not load the ``transport.requests``
    # submodule, so ``google.auth.transport.requests`` can raise
    # AttributeError unless something else imported it first. Import the
    # class explicitly here so the function does not depend on import order.
    from google.auth.transport.requests import Request as AuthRequest

    creds, _project = google.auth.default()
    creds.refresh(AuthRequest())

    config = data_repo_client.Configuration()
    config.host = "https://data.terra.bio"
    config.access_token = creds.token

    api_client = data_repo_client.ApiClient(configuration=config)
    api_client.client_side_validation = False
    return api_client

# Display options
# Show full frames in notebook output: no row/column/column-width truncation,
# a wide console, centered headers and three decimal places.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    "display.max_colwidth", None,
    'display.width', 1000,
    'display.colheader_justify', 'center',
    'display.precision', 3,
)

# Step 1: Collect Metadata for Review

In [None]:
#############################################
## Functions
#############################################

def coalesce(*arg):
    """Return the first meaningful value among the arguments.

    Arguments are examined in order:

    * ``False`` or an empty list is returned immediately (both are treated
      as deliberate values, not as "missing").
    * A non-empty list is returned with ``None`` entries and placeholder
      entries removed; if nothing survives the filter, ``[]`` is returned.
    * Any other truthy value is returned unless its upper-cased string form
      is a placeholder, in which case the search continues.
    * Other falsy values (``None``, ``""``, ``0``) are skipped.

    Placeholders are compared case-insensitively against: "", "NA", "N/A",
    "NONE", "TBD", "UNKNOWN", "UNSPECIFIED".
    NOTE(review): "NULL" is not in that set even though the original comment
    mentioned it -- confirm whether it should be added.

    Returns:
        The first qualifying value, or ``None`` when no argument qualifies.
    """
    placeholders = {"", "NA", "N/A", "NONE", "TBD", "UNKNOWN", "UNSPECIFIED"}
    for input_item in arg:
        if input_item is False or input_item == []:
            return input_item
        if not input_item:
            continue
        if isinstance(input_item, list):
            # str() keeps non-string elements (ints, bools, ...) from raising
            # AttributeError, matching how scalar inputs are handled below.
            return [
                ele for ele in input_item
                if ele is not None and str(ele).upper() not in placeholders
            ]
        if str(input_item).upper() not in placeholders:
            return input_item
    return None

def format_description(input_string):
    """Normalize whitespace and relative dbGaP study links in a description.

    * A falsy input (``None``, ``""``) yields ``""``.
    * Each ``"\\n\\n\\t"`` sequence and each remaining tab becomes one space.
    * Relative links of the form ``study.cgi?study_id=`` or
      ``./study.cgi?study_id=`` are expanded to the absolute
      ``https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=``.

    Returns:
        str: the cleaned description.
    """
    output_string = input_string if input_string else ""
    # One pass is equivalent to replacing "\n\n\t" first and "\t" second.
    output_string = re.sub(r"\n\n\t|\t", " ", output_string)
    # The lookbehind skips links that are already part of a longer path or an
    # absolute URL (preceded by "/" or a word character); without it an
    # already-absolute link would be mangled by a second expansion.
    output_string = re.sub(
        r"(?<![\w/])(?:\./)?study\.cgi\?study_id=",
        "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=",
        output_string,
    )
    return output_string

def format_phs_id(input_str):
    """Extract a dbGaP accession from text and return it as ``phsNNNNNN``.

    Searches ``input_str`` case-insensitively for ``phs`` followed by
    digits, drops leading zeros, and left-pads the number back to at least
    six digits (longer numbers are kept in full).

    Args:
        input_str: text that may contain a PHS accession (e.g.
            ``"phs000123.v1.p1"``). Non-string values are tolerated.

    Returns:
        str: the normalized accession, or ``""`` when the input is not a
        string or contains no accession.
    """
    # Explicit checks replace a bare ``except:`` that was used for control
    # flow and would also have swallowed unrelated errors.
    if not isinstance(input_str, str):
        return ""
    match = re.search(r"phs0*([0-9]+)", input_str, re.IGNORECASE)
    if match is None:
        return ""
    return "phs" + match.group(1).zfill(6)

def try_join(l):
    """Render a list as a comma-separated string; pass anything else through.

    Args:
        l: value to format.

    Returns:
        ``", "``-joined string of the stringified elements when ``l`` is a
        list; otherwise ``l`` unchanged (also returned unchanged if joining
        raises ``TypeError``).
    """
    if not isinstance(l, list):
        return l
    try:
        return ', '.join(str(element) for element in l)
    except TypeError:
        return l
    
def val_study_type_enum(l):
    """Flag a study type value that is not one of the accepted options.

    Returns:
        int: 1 when ``l`` is truthy and not an accepted study type,
        otherwise 0 (empty/None values are not flagged).
    """
    accepted_types = (
        "Observational",
        "Interventional",
        "Descriptive",
        "Analytical",
        "Prospective",
        "Retrospective",
        "Case report",
        "Case series",
        "Cross-sectional",
        "Cohort study",
    )
    if not l:
        return 0
    return 0 if l in accepted_types else 1

def val_nih_inst_center_sub_enum(l):
    """Flag a submitting NIH institute/center value that is not an accepted code.

    Returns:
        int: 1 when ``l`` is truthy and not in the accepted code list,
        otherwise 0 (empty/None values are not flagged).
    """
    accepted_codes = (
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS",
        "NIBIB", "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS",
        "NIGMS", "NIMH", "NIMHD", "NINDS", "NINR", "NLM", "CC", "CIT",
        "CSR", "FIC", "NCATS", "NCCIH",
    )
    if not l:
        return 0
    return 0 if l in accepted_codes else 1

def val_nih_ic_supp_study_enum(l):
    """Flag a list of supporting NIH institutes/centers containing an unaccepted code.

    Returns:
        int: 1 when ``l`` is a non-empty list with at least one entry outside
        the accepted code list; 0 otherwise (including when ``l`` is empty
        or not a list).
    """
    accepted_codes = (
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS",
        "NIBIB", "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS",
        "NIGMS", "NIMH", "NIMHD", "NINDS", "NINR", "NLM", "CC", "CIT",
        "CSR", "FIC", "NCATS", "NCCIH",
    )
    if not (l and isinstance(l, list)):
        return 0
    has_unknown = any(entry not in accepted_codes for entry in l)
    return 1 if has_unknown else 0

def val_file_type_enum(l):
    """Flag a list of file types containing an unaccepted value.

    Returns:
        int: 1 when ``l`` is a non-empty list with at least one entry outside
        the accepted file types; 0 otherwise (including when ``l`` is empty
        or not a list).
    """
    accepted_file_types = ("Arrays", "Genome", "Exome", "Survey", "Phenotype")
    if not (l and isinstance(l, list)):
        return 0
    has_unknown = any(entry not in accepted_file_types for entry in l)
    return 1 if has_unknown else 0

def fetch_dataset_details(snapshot_id, ds_consent_map, duos_token):
    
    # Initialize variables
    terra_dict = {}
    dbgap_xml_dict = {}
    dbgap_study_api_dict = {}
    dbgap_fhir_dict = {}
    final_results_dict = {}
    
    # Retrieve snapshot details
    api_client = refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    attempt_counter = 0
    while attempt_counter <= 2:
        try:
            snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
            break
        except:
            sleep(5)
            attempt_counter += 1  
    snapshot_name = snapshot_details["name"]
    dataset_id = snapshot_details["source"][0]["dataset"]["id"]
    phs_id = format_phs_id(snapshot_details["source"][0]["dataset"]["phs_id"])
    if snapshot_details["source"][0]["dataset"]["secure_monitoring_enabled"] == True:
        access_management = "controlled"
    else:
        access_management = "open"
    if snapshot_details["source"][0]["dataset_properties"].get("source_workspaces"):  
        source_workspace = snapshot_details["source"][0]["dataset_properties"]["source_workspaces"][0]
    else:
        source_workspace = None
    if snapshot_details["source"][0]["dataset_properties"].get("consent_name"):
        snapshot_consent_code = snapshot_details["source"][0]["dataset_properties"]["consent_name"]
    else:
        snapshot_consent_code = None
    if snapshot_details["duos_firecloud_group"] != None:
        duos_id = snapshot_details["duos_firecloud_group"]["duos_id"]
    else:
        duos_id = None
    print("\tSnapshot PHS_ID: " + str(phs_id))
    print("\tSource Workspace: " + str(source_workspace))
    print("\tCurrent DUOS ID: " + str(duos_id))
    
    # Build lookups
    datasets = requests.get(
        url=f"{url}/api/dataset/v3",
        headers={"Authorization": f"Bearer {duos_token}"}
            ).json()
    #PSEUDOCODE - For each snapshot, we want to assign a recorded DUOS ID, target DUOS ID, target Study ID
    try:
        consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', snapshot_name).group(1)
    except:
        consent_group_name = snapshot_name
    # Loop through new datasets API
        # Create base consent_group_name from dataset_name to use for comparison (see above regex)
        # Attempt to match the dataset based on the consent group name
            # If match found, stop looking, assign the "identifer" as the target DUOS ID and "study_id" as the target study id
    # Loop through study lookup
        # Attempt to match snapshot PHS ID to a PHS ID for a study
            # If match found, stop looking, and assign this as another target study ID
#     If DUO on Snapshot, use that for both Study and Dataset information
#     If dataset match, use that for both Study and Dataset information
#     If no dataset match, try study match. If study match, use that for Study information
    
        
    
    
    # Pull a list of existing AnVIL studies and datasets from DUOS
studies_processed = set()
results = []
datasets = requests.get(
    url=f"{url}/api/dataset/v2?asCustodian=true",
    headers={"Authorization": f"Bearer {token}"}
).json()
for dataset_entry in datasets:
    if dataset_entry.get("study") and dataset_entry["study"]["studyId"] not in studies_processed:
        study_id = dataset_entry["study"]["studyId"]
        if dataset_entry["study"].get("description") and "Platform: AnVIL" in dataset_entry["study"]["description"]: 
            study_name = dataset_entry["study"]["name"]
            study_phs = ""
            for prop_entry in dataset_entry["study"]["properties"]:
                if prop_entry["key"] == "dbGaPPhsID":
                    study_phs = prop_entry["value"]
                    break
            for dataset_id in dataset_entry["study"]["datasetIds"]:
                dataset_details = requests.get(
                    url=f"{url}/api/dataset/v2/{dataset_id}",
                    headers={"Authorization": f"Bearer {token}"}
                ).json()
                dataset_name = dataset_details["name"]
                dataset_identifier = dataset_details["datasetIdentifier"]
                snapshot_id = ""
                for prop_entry in dataset_entry["properties"]:
                    if prop_entry["propertyName"] == "URL":
                        snapshot_url = prop_entry["propertyValue"]
                        if snapshot_url:
                            if "https://data.terra.bio/snapshots/" in snapshot_url:
                                snapshot_id = snapshot_url.replace("https://data.terra.bio/snapshots/", "")
                        
                results.append([study_id, study_name, study_phs, dataset_id, dataset_identifier, dataset_name, snapshot_id])
        studies_processed.add(study_id)
    
    
    
    # Pull information from existing DUOS registration (if listed)
    if duos_id:
        # Establish credentials
        creds, project = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)
        
        # Pull existing DUOS study registration
        duos_dict = requests.get(
            url=f"https://consent.dsde-prod.broadinstitute.org/api/dataset/registration/{duos_id}",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
#         print(duos_dict)
        
        # Pull dataset details from DUOS (to get data use info) 
        duos_dataset_id = duos_dict["consentGroups"][0].get("datasetId")
        duos_data_use_dict = {}
        if duos_dataset_id:
            dataset_details = requests.get(
                url=f"https://consent.dsde-prod.broadinstitute.org/api/dataset/v2/{duos_dataset_id}",
                headers={"Authorization": f"Bearer {creds.token}"}
            ).json()
            duos_data_use_dict = dataset_details.get("dataUse")
#         print(duos_data_use_dict)
    
    # Pull information from DUOS
    
    # Pull information from original workspace (if listed)
    if source_workspace:
        # Establish credentials
        creds, project = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)

        # Pull workspace attributes
        attempt_counter = 0
        while attempt_counter <= 2:
            try:
                ws_attributes = requests.get(
                    url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{source_workspace}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
                    headers={"Authorization": f"Bearer {creds.token}"}
                ).json()
                break
            except:
                sleep(5)
                attempt_counter += 1
        
        # Map to schema
        if ws_attributes.get("workspace"):
            terra_dict["studyName"] = coalesce(ws_attributes["workspace"]["attributes"].get("library:projectName"), source_workspace) 
            terra_dict["studyType"] = ws_attributes["workspace"]["attributes"].get("library:studyDesign")
            terra_dict["studyDescription"] = ws_attributes["workspace"]["attributes"].get("description")
            if ws_attributes["workspace"]["attributes"].get("library:dataCategory"):
                terra_dict["dataTypes"] = []
                for item in ws_attributes["workspace"]["attributes"]["library:dataCategory"]["items"]:
                    inner_list = item.split(",")
                    for inner_item in inner_list:
                        inner_item = inner_item.replace("'", "").strip()
                        terra_dict["dataTypes"].append(inner_item)
            terra_dict["phenotypeIndication"] = ws_attributes["workspace"]["attributes"].get("library:indication")
            terra_dict["species"] = "Homo sapiens"
            terra_dict["piName"] = ws_attributes["workspace"]["attributes"].get("library:datasetOwner")
            terra_dict["dataCustodianEmail"] = [ws_attributes["workspace"]["attributes"].get("library:contactEmail")]
            if ws_attributes["workspace"]["attributes"].get("tag:tags"):
                for tag in ws_attributes["workspace"]["attributes"].get("tag:tags")["items"]:
                    if "Consortium:" in tag:
                        terra_dict["consortium"] = tag.split(":")[1].strip()
                    elif "dbGaP:" in tag:
                        terra_dict["dbGaPPhsID"] = format_phs_id(tag.split(":")[1].strip())
                        if not phs_id:
                            phs_id = format_phs_id(tag.split(":")[1].strip()) 
            terra_dict["consentGroups.consentCode"] = ws_attributes["workspace"]["attributes"].get("library:dataUseRestriction")
            if ws_attributes["workspace"]["attributes"].get("library:datatype"):
                terra_dict["consentGroups.fileTypes.fileType"] = ws_attributes["workspace"]["attributes"]["library:datatype"]["items"]
            if ws_attributes["workspace"]["attributes"].get("library:numSubjects"):
                terra_dict["consentGroups.numberOfParticipants"] = ws_attributes["workspace"]["attributes"]["library:numSubjects"]
#         print("------------------------------------------------------")
#         print("terra_dict")
#         print(terra_dict)
        
    # Pull information from dbGaP (if phs_id listed)
#     print("PHS ID for dbGaP: " + phs_id)
    if phs_id:
        # Pull and parse XML
        phs_short = phs_id.replace("phs", "")
        dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
        attempt_counter = 0
        while attempt_counter <= 2:
            try:
                response = requests.get(url=dbgap_url)
                xml_data = xmltodict.parse(response.text)
                break
            except:
                sleep(5)
                attempt_counter += 1
        study_uid = ""

        # Map to schema
        if xml_data["dbgapss"].get("Study"):
            if isinstance(xml_data["dbgapss"]["Study"], list):
                study_data = xml_data["dbgapss"]["Study"][0]
            else:
                study_data = xml_data["dbgapss"]["Study"] 
            study_uid = study_data.get("@uid")
            dbgap_xml_dict["studyName"] = study_data["StudyInfo"].get("StudyNameEntrez")
            dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
            dbgap_xml_dict["dbGaPPhsID"] = phs_id
            dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
            if study_data["Authority"]["Persons"].get("Person"):
                for ap_entry in study_data["Authority"]["Persons"]["Person"]:
                    if ap_entry["Role"] == "PI":
                        dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                        dbgap_xml_dict["piEmail"] = ap_entry["@email"]
                        dbgap_xml_dict["piInstitution"] = ap_entry["Organization"]
                    elif ap_entry["Role"] == "PO" and ap_entry["Organization"] == "NIH":
                        dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                    elif ap_entry["Role"] == "GPA" and ap_entry["Organization"] == "NIH":
                        dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
            ic_list = []
            if isinstance(study_data["Authority"]["ICs"]["IC"], list):
                for ic_entry in study_data["Authority"]["ICs"]["IC"]:
                    ic_list.append(ic_entry["@name"])
            else:
                ic_list.append(study_data["Authority"]["ICs"]["IC"]["@name"])
            dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
            dbgap_xml_dict["consentGroups.numberOfParticipants"] = study_data.get("@num_participants")
            dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")
#             print("------------------------------------------------------")
#             print("dbgap_xml_dict")
#             print(dbgap_xml_dict)
        
        # Pull and parse Study API JSON
        if study_uid:
            dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    response = requests.get(url=dbgap_study_url)
                    study_api_data = json.loads(response.text)
                    break
                except:
                    sleep(5)
                    attempt_counter += 1
            
            # Map to schema
            if study_api_data.get("error") == None:
                dbgap_study_api_dict["studyName"] = study_api_data["data"].get("report_name")
                dbgap_study_api_dict["studyDescription"] = study_api_data["data"].get("description")
                dbgap_study_api_dict["phenotypeIndication"] = study_api_data["data"].get("primary_disease")
                dbgap_study_api_dict["studyType"] = study_api_data["data"].get("study_design")
                dbgap_study_api_dict["dbGaPPhsID"] = phs_id
                dbgap_study_api_dict["dbGaPStudyRegistrationName"] = study_api_data["data"].get("report_name")
                for attr_entry in study_api_data["data"].get("attribution"):
                    if attr_entry.get("title") == "Principal Investigator":
                        dbgap_study_api_dict["piName"] = attr_entry.get("name")
                        dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
                        break
#             print("------------------------------------------------------")
#             print("dbgap_study_api_dict")
#             print(dbgap_study_api_dict)
        
        # Pull and parse FHIR API JSON
        dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
        attempt_counter = 0
        while attempt_counter <= 2:
            try:
                response = requests.get(url=dbgap_fhir_url)
                fhir_data = json.loads(response.text)
                break
            except:
                sleep(5)
                attempt_counter += 1

        # Map to schema
        if fhir_data.get("entry"):
            dbgap_fhir_dict["studyName"] = fhir_data["entry"][0]["resource"].get("title")
            dbgap_fhir_dict["studyDescription"] = fhir_data["entry"][0]["resource"].get("description")
            dbgap_fhir_dict["dbGaPPhsID"] = phs_id
            dbgap_fhir_dict["dbGaPStudyRegistrationName"] = fhir_data["entry"][0]["resource"].get("title")
            # NIH ICs
            if "Organization/" in fhir_data["entry"][0]["resource"]["sponsor"].get("reference"):
                dbgap_fhir_dict["nihICsSupportingStudy"] = [fhir_data["entry"][0]["resource"]["sponsor"].get("reference")[13:]]
            else:
                ic_display = fhir_data["entry"][0]["resource"]["sponsor"].get("display")
                if ic_display == "National Human Genome Research Institute":
                    dbgap_fhir_dict["nihICsSupportingStudy"] = ["NHGRI"]
                else:
                    dbgap_fhir_dict["nihICsSupportingStudy"] = [ic_display]
            # studyType
            if fhir_data["entry"][0]["resource"].get("category"):
                for cat_entry in fhir_data["entry"][0]["resource"].get("category"):
                    if cat_entry.get("coding"):
                        for coding_entry in cat_entry.get("coding"):
                            if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
                                value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                                if dbgap_fhir_dict.get("studyType") and value:
                                    dbgap_fhir_dict["studyType"] += f", {value}"
                                elif value:
                                    dbgap_fhir_dict["studyType"] = value
            # dataTypes
            dt_list = []
            if fhir_data["entry"][0]["resource"].get("extension"): 
                for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
                        for inner_ext_entry in ext_entry.get("extension"):
                            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                                for coding_entry in inner_ext_entry["valueCodeableConcept"].get("coding"):
                                    dt_list.append(coding_entry.get("code"))
            dbgap_fhir_dict["dataTypes"] = dt_list
            # phenotypeIndication
            if fhir_data["entry"][0]["resource"].get("focus"):
                for focus_entry in fhir_data["entry"][0]["resource"].get("focus"):
                    if focus_entry.get("coding"):
                        for coding_entry in focus_entry.get("coding"):
                            value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                            if dbgap_fhir_dict.get("phenotypeIndication") and value:
                                dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
                            elif value:
                                dbgap_fhir_dict["phenotypeIndication"] = value
            # numberOfParticipants
            if fhir_data["entry"][0]["resource"].get("extension"):
                for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
                        for inner_ext_entry in ext_entry.get("extension"):
                            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                                dbgap_fhir_dict["consentGroups.numberOfParticipants"] = inner_ext_entry["valueCount"].get("code")
#         print("------------------------------------------------------")
#         print("dbgap_fhir_dict")
#         print(dbgap_fhir_dict)
    
    # Reconcile information and create final results
    consent_code = coalesce(snapshot_consent_code, terra_dict.get("consentGroups.consentCode"), dbgap_fhir_dict.get("consentGroups.consentCode"), dbgap_xml_dict.get("consentGroups.consentCode"), dbgap_study_api_dict.get("consentGroups.consentCode"))
    if consent_code:
        consent_code = consent_code.upper().replace("_", "-")
    else:
        consent_code = ""
    consortium = coalesce(terra_dict.get("consortium"), dbgap_fhir_dict.get("consortium"), dbgap_xml_dict.get("consortium"), dbgap_study_api_dict.get("consortium"))
    dbGaPPhsID = coalesce(dbgap_fhir_dict.get("dbGaPPhsID"), dbgap_xml_dict.get("dbGaPPhsID"), dbgap_study_api_dict.get("dbGaPPhsID"), terra_dict.get("dbGaPPhsID"))
    studyName = coalesce(dbgap_fhir_dict.get("studyName"), dbgap_xml_dict.get("studyName"), dbgap_study_api_dict.get("studyName"), terra_dict.get("studyName"))
    if dbGaPPhsID and consent_code:
        study_consent = dbGaPPhsID + ":" + consent_code
        purl_doid = ds_consent_map.get(study_consent)
        if purl_doid:
            if not isinstance(purl_doid, list):
                purl_doid = [purl_doid]
        else:
            purl_doid = []
    else:
        purl_doid = []
    final_results_dict["snapshot_id"] = snapshot_id
    final_results_dict["duos_id"] = duos_id
    try:
        consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', snapshot_name).group(1)
    except:
        consent_group_name = snapshot_name
    if duos_id:
        final_results_dict["studyName"] = duos_dict.get("studyName")
        final_results_dict["studyType"] = duos_dict.get("studyType")
        final_results_dict["studyDescription"] = duos_dict.get("studyDescription")
        final_results_dict["dataTypes"] = duos_dict.get("dataTypes")
        final_results_dict["phenotypeIndication"] = duos_dict.get("phenotypeIndication")
        final_results_dict["species"] = duos_dict.get("species")
        final_results_dict["piName"] = duos_dict.get("piName")
        final_results_dict["dataCustodianEmail"] = duos_dict.get("dataCustodianEmail")
        final_results_dict["publicVisibility"] = duos_dict.get("publicVisibility")
        final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if 'already' in duos_dict.get("nihAnvilUse").lower() else "I am NHGRI funded and I do not have a dbGaP PHS ID"
        final_results_dict["submittingToAnvil"] = duos_dict.get("submittingToAnvil")
        final_results_dict["dbGaPPhsID"] = duos_dict.get("dbGaPPhsID")
        final_results_dict["dbGaPStudyRegistrationName"] = duos_dict.get("dbGaPStudyRegistrationName")
        final_results_dict["embargoReleaseDate"] = duos_dict.get("embargoReleaseDate")
        final_results_dict["sequencingCenter"] = duos_dict.get("sequencingCenter")
        final_results_dict["piEmail"] = duos_dict.get("piEmail")
        final_results_dict["piInstitution"] = duos_dict.get("piInstitution")
        final_results_dict["nihGrantContractNumber"] = duos_dict.get("nihGrantContractNumber")
        final_results_dict["nihICsSupportingStudy"] = duos_dict.get("nihICsSupportingStudy")
        final_results_dict["nihProgramOfficerName"] = duos_dict.get("nihProgramOfficerName")
        final_results_dict["nihInstitutionCenterSubmission"] = duos_dict.get("nihInstitutionCenterSubmission")
        final_results_dict["nihInstitutionalCertificationFileName"] = duos_dict.get("nihInstitutionalCertificationFileName")
        final_results_dict["nihGenomicProgramAdministratorName"] = duos_dict.get("nihGenomicProgramAdministratorName")
        final_results_dict["multiCenterStudy"] = duos_dict.get("multiCenterStudy")
        final_results_dict["collaboratingSites"] = duos_dict.get("collaboratingSites")
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSR")
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation")
        final_results_dict["alternativeDataSharingPlan"] = duos_dict.get("alternativeDataSharingPlan")
        final_results_dict["alternativeDataSharingPlanReasons"] = duos_dict.get("alternativeDataSharingPlanReasons")
        final_results_dict["alternativeDataSharingPlanExplanation"] = duos_dict.get("alternativeDataSharingPlanExplanation")
        final_results_dict["alternativeDataSharingPlanFileName"] = duos_dict.get("alternativeDataSharingPlanFileName")
        final_results_dict["alternativeDataSharingPlanDataSubmitted"] = duos_dict.get("alternativeDataSharingPlanDataSubmitted")
        final_results_dict["alternativeDataSharingPlanDataReleased"] = duos_dict.get("alternativeDataSharingPlanDataReleased")
        final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = duos_dict.get("alternativeDataSharingPlanTargetDeliveryDate")
        final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = duos_dict.get("alternativeDataSharingPlanTargetPublicReleaseDate")
        final_results_dict["alternativeDataSharingPlanAccessManagement"] = duos_dict.get("alternativeDataSharingPlanAccessManagement")
        final_results_dict["consentGroups.consentGroupName"] = consent_group_name
        final_results_dict["consentGroups.accessManagement"] = access_management
        final_results_dict["consentGroups.numberOfParticipants"] = duos_dict["consentGroups"][0].get("numberOfParticipants")
        final_results_dict["consentCode"] = consent_code
        final_results_dict["consentGroups.generalResearchUse"] = coalesce(duos_dict["consentGroups"][0].get("generalResearchUse"), duos_data_use_dict.get("generalUse"), False)
        final_results_dict["consentGroups.hmb"] = coalesce(duos_dict["consentGroups"][0].get("hmb"), duos_data_use_dict.get("hmbResearch"), False)
        final_results_dict["consentGroups.diseaseSpecificUse"] = coalesce(duos_dict["consentGroups"][0].get("diseaseSpecificUse"), duos_data_use_dict.get("diseaseRestrictions"), [])
        final_results_dict["consentGroups.gs"] = coalesce(duos_dict["consentGroups"][0].get("gs"), duos_data_use_dict.get("geographicalRestrictions"))
        final_results_dict["consentGroups.poa"] = coalesce(duos_dict["consentGroups"][0].get("poa"), duos_data_use_dict.get("populationOriginsAncestry"), False)
        final_results_dict["consentGroups.nmds"] = coalesce(duos_dict["consentGroups"][0].get("nmds"), False)
        final_results_dict["consentGroups.gso"] = coalesce(duos_dict["consentGroups"][0].get("gso"), duos_data_use_dict.get("geneticStudiesOnly"), False)
        final_results_dict["consentGroups.pub"] = coalesce(duos_dict["consentGroups"][0].get("pub"), duos_data_use_dict.get("publicationResults"), False)
        final_results_dict["consentGroups.col"] = coalesce(duos_dict["consentGroups"][0].get("col"), duos_data_use_dict.get("collaboratorRequired"), False)
        final_results_dict["consentGroups.irb"] = coalesce(duos_dict["consentGroups"][0].get("irb"), duos_data_use_dict.get("ethicsApprovalRequired"), False)
        final_results_dict["consentGroups.npu"] = coalesce(duos_dict["consentGroups"][0].get("npu"), False)
        final_results_dict["consentGroups.otherPrimary"] = duos_dict["consentGroups"][0].get("otherPrimary")
        final_results_dict["consentGroups.otherSecondary"] = duos_dict["consentGroups"][0].get("otherSecondary")
        final_results_dict["consentGroups.mor"] = duos_dict["consentGroups"][0].get("mor")
        final_results_dict["consentGroups.morDate"] = duos_dict["consentGroups"][0].get("morDate")
        final_results_dict["consentGroups.dataLocation"] = "TDR Location"
        final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
        if duos_dict["consentGroups"][0]["fileTypes"] and duos_dict["consentGroups"][0]["fileTypes"].get("fileType"):
            final_results_dict["consentGroups.fileTypes.fileType"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("fileType")
        else:
            final_results_dict["consentGroups.fileTypes.fileType"] = None
        if duos_dict["consentGroups"][0]["fileTypes"] and duos_dict["consentGroups"][0]["fileTypes"].get("functionalEquivalence"):
            final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("functionalEquivalence")
        else:
            final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
        final_results_dict["consortium"] = consortium
    else:
        if dbGaPPhsID:
            final_results_dict["studyName"] = studyName + f" ({dbGaPPhsID})"
        else:
            final_results_dict["studyName"] = studyName
        final_results_dict["studyType"] = coalesce(dbgap_fhir_dict.get("studyType"), dbgap_xml_dict.get("studyType"), dbgap_study_api_dict.get("studyType"), terra_dict.get("studyType"))
        final_results_dict["studyDescription"] = format_description(coalesce(dbgap_fhir_dict.get("studyDescription"), dbgap_xml_dict.get("studyDescription"), dbgap_study_api_dict.get("studyDescription"), terra_dict.get("studyDescription")))
        if final_results_dict["studyDescription"]:
            final_results_dict["studyDescription"] = final_results_dict["studyDescription"] + "\nPlatform: AnVIL"
        else:
            final_results_dict["studyDescription"] = "Platform: AnVIL"
        final_results_dict["dataTypes"] = coalesce(terra_dict.get("dataTypes"), dbgap_fhir_dict.get("dataTypes"), dbgap_xml_dict.get("dataTypes"), dbgap_study_api_dict.get("dataTypes"))
        final_results_dict["phenotypeIndication"] = coalesce(terra_dict.get("phenotypeIndication"), dbgap_fhir_dict.get("phenotypeIndication"), dbgap_xml_dict.get("phenotypeIndication"), dbgap_study_api_dict.get("phenotypeIndication"))
        final_results_dict["species"] = "Human"
        final_results_dict["piName"] = coalesce(dbgap_fhir_dict.get("piName"), dbgap_xml_dict.get("piName"), dbgap_study_api_dict.get("piName"), terra_dict.get("piName"), "None")
        final_results_dict["dataCustodianEmail"] = ["help@lists.anvilproject.org"]
        final_results_dict["publicVisibility"] = True
        final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if dbGaPPhsID else "I am NHGRI funded and I do not have a dbGaP PHS ID"
        final_results_dict["submittingToAnvil"] = True
        final_results_dict["dbGaPPhsID"] = dbGaPPhsID
        final_results_dict["dbGaPStudyRegistrationName"] = coalesce(dbgap_fhir_dict.get("dbGaPStudyRegistrationName"), dbgap_xml_dict.get("dbGaPStudyRegistrationName"), dbgap_study_api_dict.get("dbGaPStudyRegistrationName"), terra_dict.get("dbGaPStudyRegistrationName"))
        final_results_dict["embargoReleaseDate"] = coalesce(dbgap_fhir_dict.get("embargoReleaseDate"), dbgap_xml_dict.get("embargoReleaseDate"), dbgap_study_api_dict.get("embargoReleaseDate"), terra_dict.get("embargoReleaseDate"))
        final_results_dict["sequencingCenter"] = None
        final_results_dict["piEmail"] = coalesce(dbgap_fhir_dict.get("piEmail"), dbgap_xml_dict.get("piEmail"), dbgap_study_api_dict.get("piEmail"), terra_dict.get("piEmail"))
        final_results_dict["piInstitution"] = coalesce(dbgap_fhir_dict.get("piInstitution"), dbgap_xml_dict.get("piInstitution"), dbgap_study_api_dict.get("piInstitution"), terra_dict.get("piInstitution"))
        final_results_dict["nihGrantContractNumber"] = None
        final_results_dict["nihICsSupportingStudy"] = coalesce(dbgap_fhir_dict.get("nihICsSupportingStudy"), dbgap_xml_dict.get("nihICsSupportingStudy"), dbgap_study_api_dict.get("nihICsSupportingStudy"), terra_dict.get("nihICsSupportingStudy"))
        final_results_dict["nihProgramOfficerName"] = coalesce(dbgap_fhir_dict.get("nihProgramOfficerName"), dbgap_xml_dict.get("nihProgramOfficerName"), dbgap_study_api_dict.get("nihProgramOfficerName"), terra_dict.get("nihProgramOfficerName"))
        final_results_dict["nihInstitutionCenterSubmission"] = "NHGRI"
        final_results_dict["nihInstitutionalCertificationFileName"] = None
        final_results_dict["nihGenomicProgramAdministratorName"] = coalesce(dbgap_fhir_dict.get("nihGenomicProgramAdministratorName"), dbgap_xml_dict.get("nihGenomicProgramAdministratorName"), dbgap_study_api_dict.get("nihGenomicProgramAdministratorName"), terra_dict.get("nihGenomicProgramAdministratorName"))
        final_results_dict["multiCenterStudy"] = None
        final_results_dict["collaboratingSites"] = [consortium] if consortium else []
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = None
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = None
        final_results_dict["alternativeDataSharingPlan"] = False
        final_results_dict["alternativeDataSharingPlanReasons"] = []
        final_results_dict["alternativeDataSharingPlanExplanation"] = None
        final_results_dict["alternativeDataSharingPlanFileName"] = None
        final_results_dict["alternativeDataSharingPlanDataSubmitted"] = None
        final_results_dict["alternativeDataSharingPlanDataReleased"] = None
        final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = None
        final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = None
        final_results_dict["alternativeDataSharingPlanAccessManagement"] = None
        final_results_dict["consentGroups.consentGroupName"] = consent_group_name
        final_results_dict["consentGroups.accessManagement"] = access_management
        final_results_dict["consentGroups.numberOfParticipants"] = coalesce(terra_dict.get("consentGroups.numberOfParticipants"), dbgap_fhir_dict.get("consentGroups.numberOfParticipants"), dbgap_xml_dict.get("consentGroups.numberOfParticipants"), dbgap_study_api_dict.get("consentGroups.numberOfParticipants"), "0")
        final_results_dict["consentCode"] = consent_code
        final_results_dict["consentGroups.generalResearchUse"] = True if access_management == "controlled" and "GRU" in consent_code else False
        final_results_dict["consentGroups.hmb"] = True if access_management == "controlled" and "HMB" in consent_code else False
        if purl_doid:
            final_results_dict["consentGroups.diseaseSpecificUse"] = purl_doid
        else:
            final_results_dict["consentGroups.diseaseSpecificUse"] = []
        final_results_dict["consentGroups.gs"] = consent_code if access_management == "controlled" and "GS-" in consent_code else None
        final_results_dict["consentGroups.poa"] = True if access_management == "controlled" and "POA" in consent_code else False
        final_results_dict["consentGroups.nmds"] = True if access_management == "controlled" and "NMDS" in consent_code else False
        final_results_dict["consentGroups.gso"] = True if access_management == "controlled" and "GSO" in consent_code else False
        final_results_dict["consentGroups.pub"] = True if access_management == "controlled" and "PUB" in consent_code else False 
        final_results_dict["consentGroups.col"] = True if access_management == "controlled" and "COL-" in consent_code else False
        final_results_dict["consentGroups.irb"] = True if access_management == "controlled" and "IRB" in consent_code else False
        final_results_dict["consentGroups.npu"] = True if access_management == "controlled" and "NPU" in consent_code else False
        final_results_dict["consentGroups.otherPrimary"] = consent_code if (consent_code and access_management == "controlled" and not (final_results_dict["consentGroups.generalResearchUse"] or final_results_dict["consentGroups.hmb"] or final_results_dict["consentGroups.diseaseSpecificUse"] or final_results_dict["consentGroups.gs"] or final_results_dict["consentGroups.poa"] or final_results_dict["consentGroups.nmds"] or final_results_dict["consentGroups.gso"] or final_results_dict["consentGroups.pub"] or final_results_dict["consentGroups.col"] or final_results_dict["consentGroups.irb"] or final_results_dict["consentGroups.npu"])) else None
        final_results_dict["consentGroups.otherSecondary"] = None
        final_results_dict["consentGroups.mor"] = None
        final_results_dict["consentGroups.morDate"] = None
        final_results_dict["consentGroups.dataLocation"] = "TDR Location"
        final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
        final_results_dict["consentGroups.fileTypes.fileType"] = coalesce(terra_dict.get("consentGroups.fileTypes.fileType"), dbgap_fhir_dict.get("consentGroups.fileTypes.fileType"), dbgap_xml_dict.get("consentGroups.fileTypes.fileType"), dbgap_study_api_dict.get("consentGroups.fileTypes.fileType"))
        final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
        final_results_dict["consortium"] = consortium
    
    # Return results
    return final_results_dict


#############################################
## Input Parameters
#############################################

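# Specify the snapshots to pull data for.
# NOTE: snapshot_id_list is assigned twice below; the second, shorter list replaces this
# one, so only the snapshots in the second list are processed.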
snapshot_id_list = [
    'dd2b61fb-d420-4a38-9cd2-8464f51d7617',
    '253e2b36-1674-482b-bfbd-4e0b05cdfe63',
    '3f53e841-ca9d-4b55-b390-590718533561',
    '01cf2450-604b-43e5-9f4e-9ec4e0bf0a61',
    '85b0b351-cd0a-4efe-95a4-e39273c42831',
    '0b31081f-1bce-490a-bd0a-b1aa0fd0daf6',
    '76ec3691-30f3-43cd-af8b-e73c80da90b9',
    '62a4b183-9157-4320-96e6-32f79c561399',
    '553ba443-b8cc-4d8e-9743-e384116a1236',
    '3fc2937c-dc08-400f-9458-3779de623bd0',
    'bfab39d1-1a38-4884-a139-be2809378e7b',
    '99a1ace0-aa83-4d9d-9e9c-e9b6b0111ba2',
    '2fdba9a4-6593-439a-a7fc-c3a5825c26cd',
    '18a28450-31ec-4e4a-a305-dbbdd226ae3c',
    'f7d225d9-1675-483d-a1eb-9ef750301cd4',
    'b13297c4-bb9a-4222-b069-9efcdc9d7ac3',
    'c753046a-cf9b-4813-be68-cb3b9dd9866e',
    '37374406-0c9c-4b94-bdb5-d4f9daf3b335',
    'ad4ed62b-bf63-4dff-ab94-70a6432c161c',
    'c195ed64-842d-4525-8a1c-9083eccaafa7',
    '84103748-39eb-45f9-b4ca-e23a9e52d0cc',
    '96cfd9ec-5eb4-48f5-9284-84f349701033',
    '8f104c7f-8b5a-489e-95b4-616130405e7c',
    'b300b5ae-6ca3-4350-bc46-345173f6faba',
    'a2bb366f-029d-4bef-8da7-dce818743881',
    '7bc891a2-a634-4cf2-b41e-0b1e98fce599',
    'f2f0d8a3-6e18-436a-bc51-ff742d30d6a4',
    '2b2a1f74-9c2e-4c6c-840b-80f466d1e209',
    '9dbac1be-a33c-419c-be92-d1a5452c1292',
    '3442e75a-7452-4680-9ce5-70fa21363083',
    '79509151-b96a-483f-a6b3-eeede54467d1',
    '40d6feec-e6f7-42f1-8e74-a3404e1f9208',
    '887dd90c-6742-4283-92b2-bc9ed6bc2ae1',
    '95b4c57b-8e88-45f5-9dbb-e2575f4b2a68',
    'ec292668-bc78-45f7-b601-8d452b038e6c',
    '432c6422-ceaf-4c6c-bd8d-7c90771a284f',
    '8d055301-47e0-4384-9746-8bc1b93d9a96',
    '6f1d6a31-1997-4b59-a311-f84631ebdcbf',
    '79502d0c-bc1c-4d51-a6de-eb0334b3b660',
    '0565b2e4-ade1-46e7-80bf-ca647a89a8b8',
    '0af0d35e-1f9a-464d-80fe-474b5dbbd914',
    '2aee2dfa-a819-4beb-b8a0-c07d5d577470',
    '514bbe9f-0ffc-47a6-b25e-fe01fe26b720',
    '79c20af6-5788-47ce-9651-f6a6ae084cbc',
    'ea08adf0-2383-41ae-a91a-88c7b8f6f42b',
    'f5626dd5-b0f3-4c59-b7e1-e7fce6488419',
    'f90f565e-0ade-4750-a308-5c8e1677b43d',
    'a51570ab-b0f7-4f30-bdad-ef25e5a6e9a9',
    '194c4b14-cb6a-469c-83db-d37f7ec65f29',
    '72f7d3a3-8c70-46cb-93eb-f258a5577fd8',
    '33c73ae8-f829-438d-bdb1-da0be8f3773f',
    '3d6afb8e-dbcd-4972-8281-ae546b23356c',
    '88b16321-7f0a-44b1-8131-d4b2188d9839',
    '08f28ada-3fa1-41f3-a7eb-5b4ff8325145',
    '12d5b6d6-0942-4759-978f-768c92b9f2dd',
    '9345adce-2f83-4c02-8859-72ddccb22069',
    '75f5452d-ceca-402e-bfc4-759c8352f4da',
    'cdcfc6ac-6c9f-4d99-a8c3-4d1e5d171261',
    '245805dc-d7ab-4a78-bb35-18e1635e6ba5',
    'ecef49e8-2fa4-4507-ada8-2c8d9ad39417',
    '218f6eb5-a71f-4e2f-bc6d-ed6df248422a',
    'd3047f14-4202-4986-9daa-673a578eebcc',
    '57457a3b-ed1e-4d48-8585-ef8a4b053c64',
    '651049a2-2950-4be3-b755-6d133233a010',
    '16f19aec-a9df-4ca8-84fb-b892e9a40ea8',
    '6441b9e0-ca7b-4ab4-b7e7-9c7c7041ebaa',
    'ab9e5f9a-a829-4a63-bb72-d3cdb2d02ecf',
    'ae0c27d6-c8e3-4dd5-abf5-06e5f39fc4a0',
    '49e0dc54-7254-43c5-849e-7b1434638f73',
    'cb5a6268-c0c8-433d-b62c-7beeeb0a6a92',
    'c4ed93d6-b3f0-4f3f-844f-07d90366b64d',
    'ad18153b-870c-491a-9d4e-df30d902a03f',
    '76ee61e9-ca73-4f0b-8c7d-26f7d0a0a383',
    '0f6db24a-05d6-46fc-9ff6-795e29d10ca5',
    'a8d89992-838e-474e-86b7-b3384ce6d6a6',
    '19c8ccd5-fc2e-4c45-984e-453994dab156',
    '127fcfd9-565f-4d05-a91a-5a508ece85bd',
    'b5b89490-cc7a-44fe-8de2-f0934819c22f',
    '10d94161-9a9a-419f-8f80-b6f6b1f03f66',
    'ca17fec7-109d-4534-b969-5e0246249196',
    'e18cd59b-cb26-426b-ade1-e4342b082a6c',
    '66544f8c-034f-487e-a923-8418eb6c4f94',
    '5fe081df-aede-4283-9ee9-5858ad8d4d85',
    'e2736891-a569-449e-8cbf-b7d0274b64d0',
    '92e0bbb7-3bde-4382-984c-55324e415685',
    'bf6c8799-c680-4dc3-abd0-03589d98cf26',
    'ae61a5ab-7a98-446e-8520-e9198b6a039c',
    '6f1efaa0-0b77-4719-8d15-0d9afc01d91a',
    '602f3f08-3067-4fda-b586-1e114a03151e',
    'f14783da-5251-4800-abbd-0dbd18b2d306',
    '3a599138-282a-4dec-9a29-40bc0885321e',
    'b32c07bc-d899-4194-ba1d-a29ec70ae0b5',
    '5b4c1063-a1de-46af-a844-ce56800548d8',
    'f105f715-7bd0-4103-9c12-e097f902fc35',
    'ac852b54-1578-4812-bd62-8544617b1c00',
    '495329a6-fc6f-4088-a7d8-afd5d3148bfd',
    '30c2f5b3-80d2-4b96-846e-3774160c0417',
    '16c066f7-d2c1-4920-966c-c545e9d1d114',
    'f68344f4-4aee-4ce5-aa8f-44cd24e934e2',
    '4e711c00-75bb-4b1f-a4ee-ec8e47e2b9af',
    '0dfe1df2-8139-4fba-9afb-bd47d1a2fbf6',
    '7300865c-2f5f-412d-a40c-0eef523a1738',
    '6287a9aa-0556-4ef3-89b4-e9a16da6f71c',
    'f8e05104-0369-49f8-8469-0b49d3de1ebd',
    '9ebfd2a3-aeba-4aa4-a38f-6fbbf794c7af',
    '6babae16-eb39-4fcf-8bcb-5d4896fc2cd3',
    '269300d9-c82c-4fbc-be11-f27cc7a010bf',
    'e1560527-6e20-447c-835c-44b10fa20b79',
    '19ba7e89-3d13-4f19-a7a7-04ad93185b44',
    '02b4a9fc-4e4d-4977-be04-977ea8f88176',
    '492d02fd-2194-4c1d-a888-e665a068b35f',
    '273be8c5-6303-47a6-8b33-57f65cc88840',
    'bea23c53-b3c0-488a-a997-52ab2fe38f01',
    '74bb77d8-e245-4305-8c3f-65385d331fb9',
    '2b2d0eb2-6d1f-4072-ab28-082ec1b054e6',
    'e0d46f3a-5872-4190-ae94-8eddca9d65d0',
    '07d82c74-91a0-4eaa-87e0-a6f055d9a5c6',
    '341f76ba-c06c-4e58-a9d5-7e9f740d621d',
    '80969539-637f-4a71-bdfa-fdaf414cf8b4',
    '9a747c42-9058-4fcf-9fda-7b355e42d7b3',
    'cd3d7010-c63c-467f-b585-abe1b3da4e48',
    'd761e8f3-45e7-4d2d-99b9-462ace937e68',
    'bfadebce-def1-4e3e-97e1-0b768188db02',
    'c138efe7-9400-4ad6-b23d-287d06ab2179',
    '95dc1d22-9bff-4363-bddb-b29c266b4e28',
    '67211908-3193-4d4f-9966-a5de8548b4d0',
    'ff1d59ff-dc35-482a-933a-e9d9a1eb6a20',
    '6c6b260b-0a17-4cfa-ad5a-8cc5a75c2188',
    '363888df-5cb4-4c07-9218-06938d219c2e',
    '835c7254-7b6d-4db2-9f91-c3a5261304af',
    '461d7216-0d2c-4349-90eb-9a8e5db4d3c3',
    '4e682923-5e29-48e4-a3ff-76d86c08cf8d',
    'f8404885-5a20-4c14-a75c-5711262868fb',
    'dbd5c82f-0e81-4d4d-9f29-a34a2404fbbe',
    '6d95d827-c9b5-4296-9b90-15dc646bb00d',
    '3e4ebe7c-b5d4-4239-95da-03da7d8dedd3',
    '3cce8ea2-297c-4097-9c13-c3f1cadad921',
    '9986e29b-1b35-439d-83c9-2120679e1860',
    '12add555-dbc1-45a5-a5c4-d3d9a172759e',
    'd197888e-7be6-4bf0-b33a-0919236481b2',
    '3633eeb2-b317-4d17-9daa-4a5ca479c05b',
    'e28d0ba2-5523-4b40-b34e-0dd80653dd0a',
    'b378d487-7de4-41b1-aca9-050c6e5deef9',
    '57b52802-5caf-4611-aa0a-7371dd11d221',
    '455c8618-fe26-4424-87ae-42b1fbaeb9d9',
    '7a8e14f1-ffec-47d1-ae87-fdbb8267d427',
    '82b5cf36-94e7-408a-a4d3-db797a0ffbe9',
    'b6d3176b-525a-417a-bda0-def9611bf08a',
    '1d676097-4e9e-4c5a-8ee1-63d865054897',
    'a1df0d3b-4871-4371-8418-58a302719e6e',
    'ee5ca91d-01d8-43a9-a571-16b1390109b1',
    'ea14aac8-3c44-4f09-acdd-34d22a0169a2',
    '9c3e6a87-ac6b-4f3a-bd86-c2e0ce9051e9',
    'dc9ed67a-62da-48a4-89eb-61d86474659b',
    '441b79d9-6142-44d9-9aa9-05d4d03bc118',
    '30387b65-4b3c-4f9a-9f15-f57f30ff76ef',
    '83cfe90a-6c9e-44ca-aae4-16ed7f78554a',
    '3877f3c9-bd2f-4f86-b97b-a5bec85f9f3c',
    'f183e8af-8728-4ad0-bac0-fb68a7eb9bef',
    '83a1eee3-0395-4916-a62b-a37b24d9ca78',
    'cbc80926-dd3e-4ff8-8d8f-77078f260c7e',
    'c2f86bff-92c2-4c35-a5ec-f284bfc934b9',
    'c5da9730-1af8-4944-9dc8-273f6c845731',
    '80b8af9b-d54e-447a-9a53-e1b1c12b7e55',
    'c53cc8ed-7b5f-4c7d-ba7f-c3520856c082',
    '991b8415-06bf-4527-9753-0345b32cc4b0',
    'f033bb12-7aef-4b7e-86bb-448b8e9f1c58',
    '138c04b9-bf59-45c9-89d9-630fe606074e',
    '23dbf4be-b4c1-492a-b754-941626d03c53',
    '47cf45d3-8054-4e53-b569-ecb7d47d72b7',
    '4574e2f2-832b-430d-9558-f9ea6088cbe6',
    'a504feea-036f-4627-83c8-4cbb0e42da65',
    'ed24c069-fde1-443a-8bf2-77ec8b4e86dd',
    '58226893-e7d5-4ea3-9195-d512c70dacf9',
    '1623f347-3bb0-40cd-a9c6-207d0278025f',
    'b9992098-e09c-44f4-b091-b290b12dfc10',
    '5dcb42e6-3702-4764-a3be-2829f704f176',
    '30092c80-3b91-4433-ae6b-8085b0a19a5b',
    'e1e5a2c3-e046-4483-ad39-909980026783',
    '5a8aef0b-e101-4b9e-8cc5-da005295e42a',
    '22331a2a-42d9-47e8-a6a9-ff6fb3e71ee5',
    '15d118c9-4954-41ef-920a-bbce759e5ed0',
    'cb787d4a-8f56-4e79-a0da-2e4281e30362',
    '461c1b26-7306-4feb-b141-f83c209baf27',
    '36e807b4-3e10-41fe-a92b-21fa352648e6',
    '9e2f0ab6-f964-4aa8-a83c-894d716d55ce',
    '6b129887-63b8-41a9-aa5e-0ed83755c58f',
    'd7b2b2c6-72fd-4084-af34-a86edfe3ac47',
    'd63a63ce-24c8-413a-89c0-4bd4c82370c0',
    '6a242848-a716-4de9-ab38-3c82983810a8',
    'c48c956a-1ede-4c6e-805d-46754dc58126',
    'b116216f-8ee1-4058-b40e-0b33b0928107',
    'aeee7408-eb4e-42a9-956f-bb61759f2f55',
    'f3e644af-d04a-4bf1-8dc2-f2932e98ac89',
    '775b8b5f-9e9c-477f-a97d-a4307343b28a',
    '1b66db3a-1ef6-4f05-b8a3-b0765fa9407d',
    '7fffbc86-4ac1-4fb4-8e47-13b83706a6bd',
    'b24fe2ae-a0a2-4ac7-ad2a-f810a0b88a9b',
    'bb70de9b-a893-476b-b698-b1ee228831ee',
    '1bccd783-15a8-4191-b08f-b8a5556bfd52',
    '45cdfb71-f149-4910-89cb-446111ba741f',
    '4ca93f73-cf6c-47b9-854c-8465e79c7fc8',
    '8efb7fd2-1395-40f6-83fa-4ed459a3370e',
    'efaea9d6-0c9c-41aa-b739-600a431e8f58',
    '73f0ac44-748b-4b94-9e53-5b5eb3507ffa',
    'b8a6edf5-3636-47ab-94f6-d948f2d14571',
    '7b00e1cf-e811-40af-9f01-2b6682b1c44d',
    'f73959f2-8d1c-4998-bae4-85551e2ca445',
    'e66e025f-e07c-4f0d-93ed-3ac609b570d5',
    '6a477149-a7f0-4758-8570-b288a8314fbd',
    '07b0243c-48fc-4eee-a338-c7571cc2df1a',
    '94f79040-68f5-4801-bf41-6f29bc0be8c6',
    'c9e51094-9991-4946-b6a8-6cd19c399173',
    'ac0cdd08-47f4-4776-bd70-8bb512c6563e',
    'b9134038-96b5-434b-8456-963caac4c6db',
    '0b3dd699-d4d3-4295-8ec2-502e6c41d8d7',
    '2b781259-3ff4-44fa-bba1-e2b674548e6c',
    '8fda9c6e-3e5d-474a-99d2-07eeae12f768',
    'ca358d94-47cb-4aa1-8565-9d4280f286fb',
    '6c69e870-8def-432d-8fbe-dc0da610635e',
    '53fd76c8-6745-414e-adbe-62ff72011fc5',
    'f6db6471-03c5-44b6-a463-4976d8fc6350',
    'a6c392e5-cbef-469d-a151-4f54c73b5fb3',
    '1985e363-b6da-47ec-8c92-dabcd587e6b6',
    '389f685b-f727-41ad-adf5-c72365223ab6',
    '031a89ca-ed61-407b-91f6-a07092b48214',
    '79a135da-2f10-4dc7-8424-f49dad0cf24c',
    '1c4ff086-8435-4572-a6af-898b73852711',
    '09027102-ca0f-44d1-94cd-3ebd6af379ec',
    '808e4748-b080-4989-89ba-003a2b8b76bf',
    'f7867764-967a-4c61-a680-3c5741340bf3',
    'c48b26c3-a7aa-45ad-829c-1967ddd41be2',
    'ceb17697-5bd1-4ada-8201-cf875be1b8dd',
    '09eb47e8-1683-485b-84ae-9cef53ca6981',
    '0c742f04-7723-49b4-8b5b-290856e508c3',
    'dfbe2ba6-7f48-4309-989b-0c65e5cb2788',
    '8902fe1d-a7a9-4046-9a35-244475e113fd',
    '00d1d1e0-4b46-4af1-ba91-4f15f23d55cb',
    '66945b11-520c-4a1e-b76c-e09e36cb7a02',
    '66b2f4d2-ecea-4eee-9868-8e8c41d76efa',
    '4019997c-8d8e-4e21-8caa-26458c743b24',
    '8d157b6b-8f13-4bbf-9b88-e1fbf6844749',
    'c24bf8dd-fa9c-4e4e-98ed-83a1713f3276',
    '6b21e796-e4bd-410c-990f-31698edd7275',
    '18651608-70f6-4725-8084-aa51833367a9',
    'ad73394e-f797-4a85-aaff-b69a9a1700e0',
    '8d68226a-47c7-4c25-a1a3-95dca2b6cc1a',
    'ca3fb362-a24e-4c79-a84c-b61f60542a38',
    '7cd8067f-67b2-4934-9d07-4da82109f9e4',
    'c2bbc543-f2de-442a-9fc0-13d2e6332aba',
    '996bcad4-fa2c-4ccd-bb2a-918ddb323d0c',
    'c05817ac-8e5f-406d-995f-3826e117207c',
    'c212be24-0858-4c8e-9da9-6730f6352617',
    '6cae032f-4be2-4d2f-a136-42d096a659d8',
    '12791f33-5f01-4cf5-bf99-3f9fde75077a',
    '57bde55e-64bb-404d-b3c1-60e3ec50f46a',
    'd5606eac-b0d3-48d8-9c59-1e18d8ecc032',
    '51c965df-d267-4e26-95f0-878ca2dede2c',
]
# NOTE: this second assignment replaces the longer list defined directly above, so only
# the snapshots listed here are processed by the Execution section below.
snapshot_id_list = [
    'c195ed64-842d-4525-8a1c-9083eccaafa7',
    '84103748-39eb-45f9-b4ca-e23a9e52d0cc',
    '96cfd9ec-5eb4-48f5-9284-84f349701033',
    '8f104c7f-8b5a-489e-95b4-616130405e7c',
    'a2bb366f-029d-4bef-8da7-dce818743881',
    'f2f0d8a3-6e18-436a-bc51-ff742d30d6a4',
    '2b2a1f74-9c2e-4c6c-840b-80f466d1e209',
    '3442e75a-7452-4680-9ce5-70fa21363083',
    '79509151-b96a-483f-a6b3-eeede54467d1',
    '887dd90c-6742-4283-92b2-bc9ed6bc2ae1',
    'ec292668-bc78-45f7-b601-8d452b038e6c',
    '432c6422-ceaf-4c6c-bd8d-7c90771a284f',
    '8d055301-47e0-4384-9746-8bc1b93d9a96',
    '79502d0c-bc1c-4d51-a6de-eb0334b3b660',
]

# Mapping from "<phs ID>:<consent code>" to the DOID PURL(s) for DS (disease-specific) consent
# codes. Replace "_" with "-" in the consent code before building the key. A value is either
# a single PURL string or a list of PURL strings when one consent code covers several diseases.
# Every key must start with "phs" followed by six digits (the form produced by format_phs_id);
# two keys previously spelled "ph2002502:" have been corrected to "phs002502:".
# NOTE(review): the lookup code is outside this section — confirm it keys on this format.
ds_consent_map = {
    'phs000298:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs000693:DS-BDIS': 'http://purl.obolibrary.org/obo/DOID_936',
    'phs000693:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs000744:DS-RD': 'http://purl.obolibrary.org/obo/DOID_15',
    'phs000744:DS-THAL-IRB': 'http://purl.obolibrary.org/obo/DOID_10241',
    'phs001222:DS-DRC-IRB-NPU': 'http://purl.obolibrary.org/obo/DOID_9351',
    'phs001227:DS-ATHSCL-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_1936',
    'phs001259:DS-CARD-MDS-GSO': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001487:DS-MULTIPLE-DISEASES-IRB-COL-NPU-RD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001489:DS-EAED-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EAED-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EARET-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-ADULT-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-MULTI-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-NPU': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPCOM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBA-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-NPU-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBAID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-MBND-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs001489:DS-NSD-ADULTS-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001489:DS-NSD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001506:DS-CVD-IRB': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001592:DS-CVD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001642:DS-GR-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-DSDI-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-GID': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-IBD': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001642:DS-IBD-MDS': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001676:DS-AONDD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001740:DS-ASD-RD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001741:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001871:DS-CAD-IRB': 'http://purl.obolibrary.org/obo/DOID_3393',
    'phs001894:DS-EAC-PUB-GSO': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001901:DS-CVD-MDS': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002004:DS-AUT': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002032:DS-SMA-MDS': 'http://purl.obolibrary.org/obo/DOID_12377',
    'phs002032:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-MLHLTH-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-SZ-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002041:DS-SZRD-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002042:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002043:DS-AASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002044:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002206:DS-PEDD-IRB': 'http://purl.obolibrary.org/obo/DOID_4',
    'phs002282:DS-CVDRF': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002502:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-MLHLTH-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002502:DS-MH': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002502:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_1289',
    'phs003200:DS-MSC-MDS': ['http://purl.obolibrary.org/obo/DOID_1909', 'http://purl.obolibrary.org/obo/DOID_4159'],
}

# Token for use in DUOS (run `gcloud auth print-access-token` to get one).
# Left empty here; paste a token in before running.
# NOTE(review): neither this value nor `env` is passed explicitly to fetch_dataset_details
# in the Execution section; presumably both are read as globals — confirm.
duos_token = ""

# DUOS Environment
# NOTE(review): the accepted values are defined by the code that reads `env`; confirm there.
env = "dev"

#############################################
## Execution
#############################################
# Fetch one metadata record per snapshot, announcing each snapshot before it is processed
dataset_details_records = []
for snapshot_id in snapshot_id_list:
    print(f"Processing snapshot_id: {snapshot_id}...")
    dataset_details = fetch_dataset_details(snapshot_id, ds_consent_map)
    dataset_details_records += [dataset_details]

# Assemble the records into a dataframe ordered by study name, then consent group name
sort_columns = ["studyName", "consentGroups.consentGroupName"]
output = pd.DataFrame(dataset_details_records)
output_sorted = output.sort_values(by=sort_columns, ascending=True, ignore_index=True)

#############################################
## Validation and Output
#############################################
# Create copy of dataframe for unique value validation, so the string conversion below
# does not alter output_sorted
output_unique_val = output_sorted.copy()

# Convert study list fields to strings (try_join joins list elements with ", ")
# NOTE(review): presumably needed because list values are unhashable and would break
# nunique() below — confirm
list_fields = ["dataTypes", "dataCustodianEmail", "nihICsSupportingStudy", "collaboratingSites", "alternativeDataSharingPlanReasons"]
for field in list_fields:
    output_unique_val[field] = [try_join(l) for l in output_unique_val[field]]

# Get unique values per study-level field, by study
# Study-level columns are every column except the "consentGroups.*" columns and the four
# columns excluded by name below
study_level_col_list = []
for col in output_unique_val.columns:
    if "consentGroups." not in col and col not in ["studyName", "snapshot_id", "consortium", "consentCode"]:
        study_level_col_list.append(col)
# Count distinct values per study for each study-level column (pandas nunique() ignores
# missing values by default, so a value mixed with missing entries still counts as one)
df_unique = output_unique_val.groupby("studyName")[study_level_col_list].nunique()
# The row-wise max is taken before the result column exists, so it covers only the count
# columns; a study passes when no column has more than one distinct value
df_unique["unique_value_validation"] = df_unique.max(axis=1)
df_unique["unique_value_validation"] = ["Pass" if l <= 1 else "Fail" for l in df_unique["unique_value_validation"]]

# Create copy of dataframe for enum validation
output_enum_val = output_sorted.copy()

# Validate enum fields: each value is replaced by its validator's result.
# val_study_type_enum returns 1 for a non-empty value outside its allowed list and 0 otherwise.
# NOTE(review): the other val_* helpers are assumed to follow the same 1 = invalid
# convention — confirm against their definitions.
output_enum_val["studyType"] = [val_study_type_enum(l) for l in output_enum_val["studyType"]]
output_enum_val["nihInstitutionCenterSubmission"] = [val_nih_inst_center_sub_enum(l) for l in output_enum_val["nihInstitutionCenterSubmission"]]
output_enum_val["nihICsSupportingStudy"] = [val_nih_ic_supp_study_enum(l) for l in output_enum_val["nihICsSupportingStudy"]]
output_enum_val["consentGroups.fileTypes.fileType"] = [val_file_type_enum(l) for l in output_enum_val["consentGroups.fileTypes.fileType"]]
# Sum the validator results per study; a study fails when any column total is 1 or more
study_enum_cols = ["studyType", "nihInstitutionCenterSubmission", "nihICsSupportingStudy"]
df_study_enum = output_enum_val.groupby("studyName")[study_enum_cols].sum()
df_study_enum["study_enum_value_validation"] = df_study_enum.max(axis=1)
df_study_enum["study_enum_value_validation"] = ["Pass" if l < 1 else "Fail" for l in df_study_enum["study_enum_value_validation"]]
# Same check for the consent-group-level enum column, grouped by consent group name
consent_group_enum_cols = ["consentGroups.fileTypes.fileType"]
df_consent_group_enum = output_enum_val.groupby("consentGroups.consentGroupName")[consent_group_enum_cols].sum()
df_consent_group_enum["consent_group_enum_value_validation"] = df_consent_group_enum.max(axis=1)
df_consent_group_enum["consent_group_enum_value_validation"] = ["Pass" if l < 1 else "Fail" for l in df_consent_group_enum["consent_group_enum_value_validation"]]

# Join validation dataframes to original dataframe
# (left joins on studyName / consent group name, so every row of output_sorted is kept)
output_sorted_validated = output_sorted.join(df_unique["unique_value_validation"], on="studyName", how="left")
output_sorted_validated = output_sorted_validated.join(df_study_enum["study_enum_value_validation"], on="studyName", how="left")
output_sorted_validated = output_sorted_validated.join(df_consent_group_enum["consent_group_enum_value_validation"], on="consentGroups.consentGroupName", how="left")

# Display outputs: a two-line banner, the validated metadata table, then one titled table
# per validation result (each with its group key moved from the index into a column)
banner_line = "----------------------------------------------------------------------------------------------------"
print(banner_line)
print(banner_line)
print("Validated Metadata Output:")
display(output_sorted_validated.style.hide(axis="index"))
for section_title, results_df in (
    ("Unique Study Value Validation Results:", df_unique),
    ("Study Enum Value Validation Results:", df_study_enum),
    ("Consent Group Enum Value Validation Results:", df_consent_group_enum),
):
    print("\n")
    print(section_title)
    # In-place so the module-level dataframe itself carries the reset index afterwards
    results_df.reset_index(inplace=True)
    display(results_df.style.hide(axis="index"))


In [None]:
# Mapping from "<phs ID>:<consent code>" to the DOID PURL(s) for DS consent codes, followed by
# a pipe-delimited dump of every entry. A value is either a single PURL string or a list of
# PURL strings. Kept in sync with the mapping in the Input Parameters section: the
# 'phs002502:DS-ASD-NPU-MDS' entry that was missing here has been added, and two keys
# previously spelled "ph2002502:" have been corrected to "phs002502:".
ds_consent_map = {
    'phs000298:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs000693:DS-BDIS': 'http://purl.obolibrary.org/obo/DOID_936',
    'phs000693:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs000744:DS-RD': 'http://purl.obolibrary.org/obo/DOID_15',
    'phs000744:DS-THAL-IRB': 'http://purl.obolibrary.org/obo/DOID_10241',
    'phs001222:DS-DRC-IRB-NPU': 'http://purl.obolibrary.org/obo/DOID_9351',
    'phs001227:DS-ATHSCL-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_1936',
    'phs001259:DS-CARD-MDS-GSO': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001487:DS-MULTIPLE-DISEASES-IRB-COL-NPU-RD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001489:DS-EAED-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EAED-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EARET-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-ADULT-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-MULTI-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-NPU': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPCOM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBA-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-NPU-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBAID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-MBND-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs001489:DS-NSD-ADULTS-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001489:DS-NSD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001506:DS-CVD-IRB': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001592:DS-CVD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001642:DS-GR-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-DSDI-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-GID': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-IBD': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001642:DS-IBD-MDS': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001676:DS-AONDD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001740:DS-ASD-RD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001741:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001871:DS-CAD-IRB': 'http://purl.obolibrary.org/obo/DOID_3393',
    'phs001894:DS-EAC-PUB-GSO': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001901:DS-CVD-MDS': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002004:DS-AUT': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002032:DS-SMA-MDS': 'http://purl.obolibrary.org/obo/DOID_12377',
    'phs002032:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-MLHLTH-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-SZ-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002041:DS-SZRD-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002042:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002043:DS-AASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002044:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002206:DS-PEDD-IRB': 'http://purl.obolibrary.org/obo/DOID_4',
    'phs002282:DS-CVDRF': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002502:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-MLHLTH-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002502:DS-MH': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002502:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_1289',
    'phs003200:DS-MSC-MDS': ['http://purl.obolibrary.org/obo/DOID_1909', 'http://purl.obolibrary.org/obo/DOID_4159'],
}

# Print each entry as "<key>|<value>". List values are joined with ", " first, because
# concatenating a str with a list raises TypeError (this crashed on the last entry).
for key, val in ds_consent_map.items():
    val_text = ", ".join(val) if isinstance(val, list) else val
    print(f"{key}|{val_text}")

In [None]:
def coalesce(*arg):
    """Return the first argument that carries a usable value.

    Arguments are examined left to right:

    * ``False`` and ``[]`` count as deliberate values and are returned as-is.
    * A non-empty list is returned with ``None`` entries and placeholder
      entries (see ``remove_list``) filtered out. If nothing survives the
      filter, ``[]`` is returned and later arguments are not consulted.
    * Any other truthy value is returned unless its upper-cased string form
      is one of the placeholders.

    Returns:
        The first qualifying value, or ``None`` if no argument qualifies.
    """
    # Placeholder strings treated as "no value" (compared case-insensitively).
    # NOTE(review): the original comment also mentioned "Null", which is not in
    # this list -- confirm whether it should be added.
    remove_list = ["", "NA", "N/A", "NONE", "TBD", "UNKNOWN", "UNSPECIFIED"]
    for input_item in arg:
        if input_item is False or input_item == []:
            return input_item
        elif input_item:
            if isinstance(input_item, list):
                # str() keeps non-string elements (ints, bools, ...) from
                # raising AttributeError on .upper(), matching the scalar branch.
                return [
                    ele for ele in input_item
                    if ele is not None and str(ele).upper() not in remove_list
                ]
            else:
                if str(input_item).upper() not in remove_list:
                    return input_item
    return None

def format_description(input_string):
    """Clean up a study description and expand relative dbGaP study links.

    Args:
        input_string: Description text; any falsy value is treated as "".

    Returns:
        The text with tab characters (and a "\\n\\n\\t" run) replaced by a
        single space, and relative ``study.cgi?study_id=`` /
        ``./study.cgi?study_id=`` links rewritten to absolute dbGaP URLs.
    """
    output_string = input_string if input_string else ""
    output_string = re.sub("\n\n\t", " ", output_string)
    output_string = re.sub("\t", " ", output_string)
    # Raw string with escaped literals: the previous pattern used an unescaped
    # "." in front of "/study.cgi", which swallowed an arbitrary preceding
    # character and re-expanded links that were already absolute. The
    # lookbehind skips occurrences that are part of a longer path/URL.
    output_string = re.sub(
        r"(?<![\w/])(?:\./)?study\.cgi\?study_id=",
        "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=",
        output_string,
    )
    return output_string

def format_phs_id(input_str):
    """Normalize a dbGaP accession to ``phs`` followed by six zero-padded digits.

    The first ``phs<digits>`` occurrence (case-insensitive) in ``input_str`` is
    used; anything after the digits (e.g. version suffixes) is ignored.

    Args:
        input_str: Text expected to contain a PHS accession.

    Returns:
        The normalized ID (e.g. ``"phs001489"``), or ``""`` when ``input_str``
        is not a string or contains no accession.
    """
    # Explicit checks instead of a bare ``except:`` so unrelated errors are
    # not silently swallowed.
    if not isinstance(input_str, str):
        return ""
    match = re.search("phs0*([0-9]+)", input_str, re.IGNORECASE)
    if match is None:
        return ""
    return "phs" + match.group(1).zfill(6)

def try_join(l):
    """Render a list as a comma-separated string; pass other values through.

    Each list element is converted with ``str`` before joining. If the
    conversion raises ``TypeError`` the original list is returned unchanged.
    """
    if not isinstance(l, list):
        return l
    try:
        pieces = [str(element) for element in l]
        return ', '.join(pieces)
    except TypeError:
        return l
    
def val_study_type_enum(l):
    """Flag a study type that is not one of the accepted values.

    Returns 1 when ``l`` is truthy and not an accepted study type, otherwise 0
    (empty / missing values are not flagged).
    """
    accepted_study_types = (
        "Observational", "Interventional", "Descriptive", "Analytical",
        "Prospective", "Retrospective", "Case report", "Case series",
        "Cross-sectional", "Cohort study",
    )
    if not l:
        return 0
    return 0 if l in accepted_study_types else 1

def val_nih_inst_center_sub_enum(l):
    """Flag an NIH institute/center abbreviation that is not recognized.

    Returns 1 when ``l`` is truthy and not in the accepted abbreviation list,
    otherwise 0 (empty / missing values are not flagged).
    """
    accepted_ics = (
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS",
        "NIBIB", "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS", "NIGMS",
        "NIMH", "NIMHD", "NINDS", "NINR", "NLM", "CC", "CIT", "CSR", "FIC",
        "NCATS", "NCCIH",
    )
    if not l:
        return 0
    return 0 if l in accepted_ics else 1

def val_nih_ic_supp_study_enum(l):
    """Flag a list of NIH institute/center abbreviations containing an unknown entry.

    Returns 1 when ``l`` is a non-empty list with at least one element outside
    the accepted abbreviation list; returns 0 for anything else (including
    non-list and empty inputs).
    """
    accepted_ics = (
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS",
        "NIBIB", "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS", "NIGMS",
        "NIMH", "NIMHD", "NINDS", "NINR", "NLM", "CC", "CIT", "CSR", "FIC",
        "NCATS", "NCCIH",
    )
    if not (l and isinstance(l, list)):
        return 0
    has_unknown = any(entry not in accepted_ics for entry in l)
    return 1 if has_unknown else 0

def val_file_type_enum(l):
    """Flag a list of file types containing an unrecognized entry.

    Returns 1 when ``l`` is a non-empty list with at least one element outside
    the accepted file types; returns 0 for anything else (including non-list
    and empty inputs).
    """
    accepted_file_types = ("Arrays", "Genome", "Exome", "Survey", "Phenotype")
    if not (l and isinstance(l, list)):
        return 0
    has_unknown = any(entry not in accepted_file_types for entry in l)
    return 1 if has_unknown else 0

In [None]:
# Snapshot list: TDR snapshot IDs to process. The processing loop below
# iterates this list and retrieves each snapshot by ID. Uncomment an entry to
# include that snapshot in the run.
snapshot_id_list = [
    'e2736891-a569-449e-8cbf-b7d0274b64d0',
#     'a2bb366f-029d-4bef-8da7-dce818743881',
#     'f2f0d8a3-6e18-436a-bc51-ff742d30d6a4',
#     '2b2a1f74-9c2e-4c6c-840b-80f466d1e209',
#     '3442e75a-7452-4680-9ce5-70fa21363083',
#     '79509151-b96a-483f-a6b3-eeede54467d1',
#     '887dd90c-6742-4283-92b2-bc9ed6bc2ae1',
#     'ec292668-bc78-45f7-b601-8d452b038e6c',
#     '432c6422-ceaf-4c6c-bd8d-7c90771a284f',
#     '8d055301-47e0-4384-9746-8bc1b93d9a96',
#     '79502d0c-bc1c-4d51-a6de-eb0334b3b660',
]

# Specify a mapping from phs-consent to DOID for DS consent codes (replace "_" with "-" in consent first)
# Keys have the form "<phs id>:<consent code>", with the phs id written as
# "phs" + six digits. Values are a DOID URL, or a list of DOID URLs when a
# consent code covers more than one disease.
ds_consent_map = {
    'phs000298:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs000693:DS-BDIS': 'http://purl.obolibrary.org/obo/DOID_936',
    'phs000693:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs000744:DS-RD': 'http://purl.obolibrary.org/obo/DOID_15',
    'phs000744:DS-THAL-IRB': 'http://purl.obolibrary.org/obo/DOID_10241',
    'phs001222:DS-DRC-IRB-NPU': 'http://purl.obolibrary.org/obo/DOID_9351',
    'phs001227:DS-ATHSCL-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_1936',
    'phs001259:DS-CARD-MDS-GSO': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001487:DS-MULTIPLE-DISEASES-IRB-COL-NPU-RD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001489:DS-EAED-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EAED-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EARET-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-ADULT-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-MULTI-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-NPU': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPCOM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBA-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-NPU-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBAID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-MBND-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs001489:DS-NSD-ADULTS-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001489:DS-NSD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001506:DS-CVD-IRB': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001592:DS-CVD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001642:DS-GR-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-DSDI-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-GID': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-IBD': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001642:DS-IBD-MDS': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001676:DS-AONDD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001740:DS-ASD-RD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001741:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001871:DS-CAD-IRB': 'http://purl.obolibrary.org/obo/DOID_3393',
    'phs001894:DS-EAC-PUB-GSO': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001901:DS-CVD-MDS': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002004:DS-AUT': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002032:DS-SMA-MDS': 'http://purl.obolibrary.org/obo/DOID_12377',
    'phs002032:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-MLHLTH-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-SZ-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002041:DS-SZRD-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002042:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002043:DS-AASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002044:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002206:DS-PEDD-IRB': 'http://purl.obolibrary.org/obo/DOID_4',
    'phs002282:DS-CVDRF': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002502:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-NPU': 'http://purl.obolibrary.org/obo/DOID_0060041',
    # The next two keys were previously spelled "ph2002502:", which can never
    # match a normalized "phs######" ID.
    'phs002502:DS-MLHLTH-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002502:DS-MH': 'http://purl.obolibrary.org/obo/DOID_150',
    # NOTE(review): the other DS-MBND entries map to DOID_150 -- confirm that
    # DOID_1289 is intended here.
    'phs002502:DS-MBND-MDS': 'http://purl.obolibrary.org/obo/DOID_1289',
    'phs003200:DS-MSC-MDS': ['http://purl.obolibrary.org/obo/DOID_1909', 'http://purl.obolibrary.org/obo/DOID_4159']
}

# Token for use in DUOS (use gcloud auth print-access-token to get this)
# Sent as the "Authorization: Bearer <token>" header on every DUOS request
# made below. Leave it empty in saved copies of the notebook.
duos_token = ""

# DUOS Environment
# "prod" selects the production DUOS URL; any other value selects the dev URL.
duos_env = "dev"


In [None]:
# Result accumulator: one dictionary per processed snapshot is appended here
dataset_details_records = []

# Per-source metadata dictionaries filled in by the snapshot loop.
# NOTE(review): these are created once here; confirm they are reset for each
# snapshot so values cannot carry over between snapshots.
terra_dict = {}
dbgap_xml_dict = {}
dbgap_study_api_dict = {}
dbgap_fhir_dict = {}
final_results_dict = {}

# Pick the DUOS base URL: production only when duos_env is exactly "prod",
# the dev instance for any other value
url = (
    "https://consent.dsde-prod.broadinstitute.org"
    if duos_env == "prod"
    else "https://consent.dsde-dev.broadinstitute.org"
)

# Build DUOS lookups
#   dataset_lookup: one record per DUOS dataset that is attached to a study
#   study_lookup:   dbGaP PHS ID -> DUOS study id, for studies whose
#                   description contains "Platform: AnVIL"
print("Building DUOS dataset and study lookups...")
study_lookup = {}
dataset_lookup = []
duos_datasets_response = requests.get(
    url=f"{url}/api/dataset/v3",
    headers={"Authorization": f"Bearer {duos_token}"}
)
# Fail with a clear HTTP error (e.g. missing/expired token) instead of an
# obscure failure while iterating an error payload.
duos_datasets_response.raise_for_status()
datasets = duos_datasets_response.json()
study_ids_processed = set()
# Counter must exist before the loop increments it; previously this relied on
# a value left over from an earlier cell (NameError on a fresh kernel).
datasets_parsed = 0
for dataset_entry in datasets:
    datasets_parsed += 1
    dataset_id = dataset_entry.get("dataset_id")
    dataset_name = dataset_entry.get("dataset_name")
    identifier = dataset_entry.get("identifier")
    study_id = dataset_entry.get("study_id")
    # Strip a trailing "_<8 digits>_ANV<n>_<12 digits>" suffix to get the
    # consent group name; fall back to the full name when it is absent.
    try:
        consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', dataset_name).group(1)
    except (AttributeError, TypeError):
        # AttributeError: no match (search returned None);
        # TypeError: dataset_name is missing / not a string.
        consent_group_name = dataset_name
    if study_id:
        # Build dataset lookup
        dataset_lookup.append({
            "dataset_id": dataset_id,
            "current_consent_group_name": dataset_name, 
            "consent_group_name": consent_group_name,
            "identifier": identifier,
            "study_id": study_id
        })
        # Build study lookup (the registration is fetched once per study id)
        if study_id not in study_ids_processed:
            study_ids_processed.add(study_id)
            study_details = requests.get(
                url=f"{url}/api/dataset/registration/{identifier}",
                headers={"Authorization": f"Bearer {duos_token}"}
            ).json()
            study_desc = study_details.get("studyDescription")
            if study_desc and "Platform: AnVIL" in study_desc:
                study_phs = study_details.get("dbGaPPhsID")
                # NOTE(review): keys are stored exactly as DUOS returns them,
                # while the snapshot loop looks them up with the output of
                # format_phs_id(...) -- confirm the two formats agree.
                if study_phs:
                    id_in_lookup = study_lookup.get(study_phs)
                    if id_in_lookup and id_in_lookup != study_id:
                        print(f"Warning: PHS ID {study_phs} tied to multiple studies in DUOS: {id_in_lookup}, {study_id}. Please review.")
                    else:
                        study_lookup[study_phs] = study_id

# Loop through and process snapshots
for snapshot_id in snapshot_id_list:

    # Retrieve snapshot details
    print(f"Processing snapshot_id: {snapshot_id}...")
    final_results_dict = {}
    api_client = refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    attempt_counter = 0
    snapshot_details = {}
    while attempt_counter <= 2:
        try:
            snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
            break
        except:
            sleep(5)
            attempt_counter += 1  
    snapshot_name = snapshot_details["name"]
    try:
        consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', snapshot_name).group(1)
    except:
        consent_group_name = snapshot_name
    dataset_id = snapshot_details["source"][0]["dataset"]["id"]
    phs_id = format_phs_id(snapshot_details["source"][0]["dataset"]["phs_id"])
    if snapshot_details["source"][0]["dataset"]["secure_monitoring_enabled"] == True:
        access_management = "controlled"
    else:
        access_management = "open"
    if snapshot_details["source"][0]["dataset_properties"].get("source_workspaces"):  
        source_workspace = snapshot_details["source"][0]["dataset_properties"]["source_workspaces"][0]
    else:
        source_workspace = None
    if snapshot_details["source"][0]["dataset_properties"].get("consent_name"):
        snapshot_consent_code = snapshot_details["source"][0]["dataset_properties"]["consent_name"]
    else:
        snapshot_consent_code = None
    if snapshot_details["duos_firecloud_group"] != None:
        duos_id = snapshot_details["duos_firecloud_group"]["duos_id"]
    else:
        duos_id = None
    duos_id = "DUOS-000774" # REMOVE THIS
    print("\tSnapshot PHS_ID: " + str(phs_id))
    print("\tSnapshot Consent Code: " + str(snapshot_consent_code))
    print("\tSource Workspace: " + str(source_workspace))
    print("\tDUOS ID: " + str(duos_id))
    
    # Attempt to match to a DUOS ID based on consent group name and DUOS Study based on PHS ID
    match_duos_id = ""
    for dataset in dataset_lookup:
        if dataset["consent_group_name"] == consent_group_name:
            match_duos_id = dataset["identifier"]
            break
    match_study_id = ""
    if phs_id:
        match_study_id = study_lookup.get(phs_id)
    
    # If a snapshot or match DUOS ID is present, use this to build the final result dictionary
    if duos_id or match_duos_id:
        
        duos_id_to_use = coalesce(duos_id, match_duos_id)
        
        # Pull existing DUOS study registration
        duos_dict = requests.get(
            url=f"{url}/api/dataset/registration/{duos_id_to_use}",
            headers={"Authorization": f"Bearer {duos_token}"}
        ).json()
        #print(duos_dict)

        # Pull dataset details from DUOS (to get data use info) 
        duos_dataset_id = duos_dict["consentGroups"][0].get("datasetId")
        duos_data_use_dict = {}
        if duos_dataset_id:
            dataset_details = requests.get(
                url=f"{url}/api/dataset/v2/{duos_dataset_id}",
                headers={"Authorization": f"Bearer {duos_token}"}
            ).json()
            duos_data_use_dict = dataset_details.get("dataUse")
        duos_data_use_dict = {} # REMOVE THIS
        
        # Build final results dictionary
        if snapshot_consent_code:
            consent_code = snapshot_consent_code.upper().replace("_", "-")
        else:
            consent_code = ""
        final_results_dict["snapshot_id"] = snapshot_id
        final_results_dict["snapshot_duos_id"] = duos_id
        final_results_dict["match_duos_id"] = match_duos_id
        final_results_dict["match_study_id"] = match_study_id
        final_results_dict["studyName"] = duos_dict.get("studyName")
        final_results_dict["studyType"] = duos_dict.get("studyType")
        final_results_dict["studyDescription"] = duos_dict.get("studyDescription")
        final_results_dict["dataTypes"] = duos_dict.get("dataTypes")
        final_results_dict["phenotypeIndication"] = duos_dict.get("phenotypeIndication")
        final_results_dict["species"] = duos_dict.get("species")
        final_results_dict["piName"] = duos_dict.get("piName")
        final_results_dict["dataCustodianEmail"] = duos_dict.get("dataCustodianEmail")
        final_results_dict["publicVisibility"] = duos_dict.get("publicVisibility")
        final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if 'already' in duos_dict.get("nihAnvilUse").lower() else "I am NHGRI funded and I do not have a dbGaP PHS ID"
        final_results_dict["submittingToAnvil"] = duos_dict.get("submittingToAnvil")
        final_results_dict["dbGaPPhsID"] = duos_dict.get("dbGaPPhsID")
        final_results_dict["dbGaPStudyRegistrationName"] = duos_dict.get("dbGaPStudyRegistrationName")
        final_results_dict["embargoReleaseDate"] = duos_dict.get("embargoReleaseDate")
        final_results_dict["sequencingCenter"] = duos_dict.get("sequencingCenter")
        final_results_dict["piEmail"] = duos_dict.get("piEmail")
        final_results_dict["piInstitution"] = duos_dict.get("piInstitution")
        final_results_dict["nihGrantContractNumber"] = duos_dict.get("nihGrantContractNumber")
        final_results_dict["nihICsSupportingStudy"] = duos_dict.get("nihICsSupportingStudy")
        final_results_dict["nihProgramOfficerName"] = duos_dict.get("nihProgramOfficerName")
        final_results_dict["nihInstitutionCenterSubmission"] = duos_dict.get("nihInstitutionCenterSubmission")
        final_results_dict["nihInstitutionalCertificationFileName"] = duos_dict.get("nihInstitutionalCertificationFileName")
        final_results_dict["nihGenomicProgramAdministratorName"] = duos_dict.get("nihGenomicProgramAdministratorName")
        final_results_dict["multiCenterStudy"] = duos_dict.get("multiCenterStudy")
        final_results_dict["collaboratingSites"] = duos_dict.get("collaboratingSites")
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSR")
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation")
        final_results_dict["alternativeDataSharingPlan"] = duos_dict.get("alternativeDataSharingPlan")
        final_results_dict["alternativeDataSharingPlanReasons"] = duos_dict.get("alternativeDataSharingPlanReasons")
        final_results_dict["alternativeDataSharingPlanExplanation"] = duos_dict.get("alternativeDataSharingPlanExplanation")
        final_results_dict["alternativeDataSharingPlanFileName"] = duos_dict.get("alternativeDataSharingPlanFileName")
        final_results_dict["alternativeDataSharingPlanDataSubmitted"] = duos_dict.get("alternativeDataSharingPlanDataSubmitted")
        final_results_dict["alternativeDataSharingPlanDataReleased"] = duos_dict.get("alternativeDataSharingPlanDataReleased")
        final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = duos_dict.get("alternativeDataSharingPlanTargetDeliveryDate")
        final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = duos_dict.get("alternativeDataSharingPlanTargetPublicReleaseDate")
        final_results_dict["alternativeDataSharingPlanAccessManagement"] = duos_dict.get("alternativeDataSharingPlanAccessManagement")
        final_results_dict["consentGroups.consentGroupName"] = consent_group_name
        final_results_dict["consentGroups.accessManagement"] = access_management
        final_results_dict["consentGroups.numberOfParticipants"] = duos_dict["consentGroups"][0].get("numberOfParticipants")
        final_results_dict["consentCode"] = consent_code
        final_results_dict["consentGroups.generalResearchUse"] = coalesce(duos_dict["consentGroups"][0].get("generalResearchUse"), duos_data_use_dict.get("generalUse"), False)
        final_results_dict["consentGroups.hmb"] = coalesce(duos_dict["consentGroups"][0].get("hmb"), duos_data_use_dict.get("hmbResearch"), False)
        final_results_dict["consentGroups.diseaseSpecificUse"] = coalesce(duos_dict["consentGroups"][0].get("diseaseSpecificUse"), duos_data_use_dict.get("diseaseRestrictions"), [])
        final_results_dict["consentGroups.gs"] = coalesce(duos_dict["consentGroups"][0].get("gs"), duos_data_use_dict.get("geographicalRestrictions"))
        final_results_dict["consentGroups.poa"] = coalesce(duos_dict["consentGroups"][0].get("poa"), duos_data_use_dict.get("populationOriginsAncestry"), False)
        final_results_dict["consentGroups.nmds"] = coalesce(duos_dict["consentGroups"][0].get("nmds"), False)
        final_results_dict["consentGroups.gso"] = coalesce(duos_dict["consentGroups"][0].get("gso"), duos_data_use_dict.get("geneticStudiesOnly"), False)
        final_results_dict["consentGroups.pub"] = coalesce(duos_dict["consentGroups"][0].get("pub"), duos_data_use_dict.get("publicationResults"), False)
        final_results_dict["consentGroups.col"] = coalesce(duos_dict["consentGroups"][0].get("col"), duos_data_use_dict.get("collaboratorRequired"), False)
        final_results_dict["consentGroups.irb"] = coalesce(duos_dict["consentGroups"][0].get("irb"), duos_data_use_dict.get("ethicsApprovalRequired"), False)
        final_results_dict["consentGroups.npu"] = coalesce(duos_dict["consentGroups"][0].get("npu"), False)
        final_results_dict["consentGroups.otherPrimary"] = duos_dict["consentGroups"][0].get("otherPrimary")
        final_results_dict["consentGroups.otherSecondary"] = duos_dict["consentGroups"][0].get("otherSecondary")
        final_results_dict["consentGroups.mor"] = duos_dict["consentGroups"][0].get("mor")
        final_results_dict["consentGroups.morDate"] = duos_dict["consentGroups"][0].get("morDate")
        final_results_dict["consentGroups.dataLocation"] = "TDR Location"
        final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
        if duos_dict["consentGroups"][0]["fileTypes"] and duos_dict["consentGroups"][0]["fileTypes"].get("fileType"):
            final_results_dict["consentGroups.fileTypes.fileType"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("fileType")
        else:
            final_results_dict["consentGroups.fileTypes.fileType"] = None
        if duos_dict["consentGroups"][0]["fileTypes"] and duos_dict["consentGroups"][0]["fileTypes"].get("functionalEquivalence"):
            final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("functionalEquivalence")
        else:
            final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
        collab_site = duos_dict.get("collaboratingSites")
        if collab_site:
            final_results_dict["consortium"] = collab_site[0]
        else:
            final_results_dict["consortium"] = None
        dataset_details_records.append(final_results_dict)
        continue
        
    # Pull study information from DUOS (if matched to existing study)
    duos_study_dict = {}
    if match_study_id:
        duos_study_dict = requests.get(
                url=f"{url}/api/dataset/study/registration/{match_study_id}",
                headers={"Authorization": f"Bearer {duos_token}"}
            ).json()
        collab_site = duos_study_dict.get("collaboratingSites")
        if collab_site:
            duos_study_dict["consortium"] = collab_site[0]
    
    # Pull information from original workspace (if listed)
    if source_workspace:
        # Establish credentials
        creds, project = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)

        # Pull workspace attributes
        attempt_counter = 0
        while attempt_counter <= 2:
            try:
                ws_attributes = requests.get(
                    url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{source_workspace}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
                    headers={"Authorization": f"Bearer {creds.token}"}
                ).json()
                break
            except:
                sleep(5)
                attempt_counter += 1

        # Map to schema
        if ws_attributes.get("workspace"):
            terra_dict["studyName"] = coalesce(ws_attributes["workspace"]["attributes"].get("library:projectName"), source_workspace) 
            terra_dict["studyType"] = ws_attributes["workspace"]["attributes"].get("library:studyDesign")
            terra_dict["studyDescription"] = ws_attributes["workspace"]["attributes"].get("description")
            if ws_attributes["workspace"]["attributes"].get("library:dataCategory"):
                terra_dict["dataTypes"] = []
                for item in ws_attributes["workspace"]["attributes"]["library:dataCategory"]["items"]:
                    inner_list = item.split(",")
                    for inner_item in inner_list:
                        inner_item = inner_item.replace("'", "").strip()
                        terra_dict["dataTypes"].append(inner_item)
            terra_dict["phenotypeIndication"] = ws_attributes["workspace"]["attributes"].get("library:indication")
            terra_dict["species"] = "Homo sapiens"
            terra_dict["piName"] = ws_attributes["workspace"]["attributes"].get("library:datasetOwner")
            terra_dict["dataCustodianEmail"] = [ws_attributes["workspace"]["attributes"].get("library:contactEmail")]
            if ws_attributes["workspace"]["attributes"].get("tag:tags"):
                for tag in ws_attributes["workspace"]["attributes"].get("tag:tags")["items"]:
                    if "Consortium:" in tag:
                        terra_dict["consortium"] = tag.split(":")[1].strip()
                    elif "dbGaP:" in tag:
                        terra_dict["dbGaPPhsID"] = format_phs_id(tag.split(":")[1].strip())
                        if not phs_id:
                            phs_id = format_phs_id(tag.split(":")[1].strip()) 
            terra_dict["consentGroups.consentCode"] = ws_attributes["workspace"]["attributes"].get("library:dataUseRestriction")
            if ws_attributes["workspace"]["attributes"].get("library:datatype"):
                terra_dict["consentGroups.fileTypes.fileType"] = ws_attributes["workspace"]["attributes"]["library:datatype"]["items"]
            if ws_attributes["workspace"]["attributes"].get("library:numSubjects"):
                terra_dict["consentGroups.numberOfParticipants"] = ws_attributes["workspace"]["attributes"]["library:numSubjects"]
    #         print("------------------------------------------------------")
    #         print("terra_dict")
    #         print(terra_dict)

    # Pull information from dbGaP (if phs_id listed)
    #     print("PHS ID for dbGaP: " + phs_id)
    if phs_id:
        # Pull and parse XML
        phs_short = phs_id.replace("phs", "")
        dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
        attempt_counter = 0
        while attempt_counter <= 2:
            try:
                response = requests.get(url=dbgap_url)
                xml_data = xmltodict.parse(response.text)
                break
            except:
                sleep(5)
                attempt_counter += 1
        study_uid = ""

        # Map to schema
        if xml_data["dbgapss"].get("Study"):
            if isinstance(xml_data["dbgapss"]["Study"], list):
                study_data = xml_data["dbgapss"]["Study"][0]
            else:
                study_data = xml_data["dbgapss"]["Study"] 
            study_uid = study_data.get("@uid")
            dbgap_xml_dict["studyName"] = study_data["StudyInfo"].get("StudyNameEntrez")
            dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
            dbgap_xml_dict["dbGaPPhsID"] = phs_id
            dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
            if study_data["Authority"]["Persons"].get("Person"):
                for ap_entry in study_data["Authority"]["Persons"]["Person"]:
                    if ap_entry["Role"] == "PI":
                        dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                        dbgap_xml_dict["piEmail"] = ap_entry["@email"]
                        dbgap_xml_dict["piInstitution"] = ap_entry["Organization"]
                    elif ap_entry["Role"] == "PO" and ap_entry["Organization"] == "NIH":
                        dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                    elif ap_entry["Role"] == "GPA" and ap_entry["Organization"] == "NIH":
                        dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
            ic_list = []
            if isinstance(study_data["Authority"]["ICs"]["IC"], list):
                for ic_entry in study_data["Authority"]["ICs"]["IC"]:
                    ic_list.append(ic_entry["@name"])
            else:
                ic_list.append(study_data["Authority"]["ICs"]["IC"]["@name"])
            dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
            dbgap_xml_dict["consentGroups.numberOfParticipants"] = study_data.get("@num_participants")
            dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")
    #             print("------------------------------------------------------")
    #             print("dbgap_xml_dict")
    #             print(dbgap_xml_dict)

        # Pull and parse Study API JSON
        if study_uid:
            dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    response = requests.get(url=dbgap_study_url)
                    study_api_data = json.loads(response.text)
                    break
                except:
                    sleep(5)
                    attempt_counter += 1

            # Map to schema
            if study_api_data.get("error") == None:
                dbgap_study_api_dict["studyName"] = study_api_data["data"].get("report_name")
                dbgap_study_api_dict["studyDescription"] = study_api_data["data"].get("description")
                dbgap_study_api_dict["phenotypeIndication"] = study_api_data["data"].get("primary_disease")
                dbgap_study_api_dict["studyType"] = study_api_data["data"].get("study_design")
                dbgap_study_api_dict["dbGaPPhsID"] = phs_id
                dbgap_study_api_dict["dbGaPStudyRegistrationName"] = study_api_data["data"].get("report_name")
                for attr_entry in study_api_data["data"].get("attribution"):
                    if attr_entry.get("title") == "Principal Investigator":
                        dbgap_study_api_dict["piName"] = attr_entry.get("name")
                        dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
                        break
    #             print("------------------------------------------------------")
    #             print("dbgap_study_api_dict")
    #             print(dbgap_study_api_dict)

        # Pull and parse FHIR API JSON
        dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
        attempt_counter = 0
        while attempt_counter <= 2:
            try:
                response = requests.get(url=dbgap_fhir_url)
                fhir_data = json.loads(response.text)
                break
            except:
                sleep(5)
                attempt_counter += 1

        # Map to schema
        if fhir_data.get("entry"):
            dbgap_fhir_dict["studyName"] = fhir_data["entry"][0]["resource"].get("title")
            dbgap_fhir_dict["studyDescription"] = fhir_data["entry"][0]["resource"].get("description")
            dbgap_fhir_dict["dbGaPPhsID"] = phs_id
            dbgap_fhir_dict["dbGaPStudyRegistrationName"] = fhir_data["entry"][0]["resource"].get("title")
            # NIH ICs
            if "Organization/" in fhir_data["entry"][0]["resource"]["sponsor"].get("reference"):
                dbgap_fhir_dict["nihICsSupportingStudy"] = [fhir_data["entry"][0]["resource"]["sponsor"].get("reference")[13:]]
            else:
                ic_display = fhir_data["entry"][0]["resource"]["sponsor"].get("display")
                if ic_display == "National Human Genome Research Institute":
                    dbgap_fhir_dict["nihICsSupportingStudy"] = ["NHGRI"]
                else:
                    dbgap_fhir_dict["nihICsSupportingStudy"] = [ic_display]
            # studyType
            if fhir_data["entry"][0]["resource"].get("category"):
                for cat_entry in fhir_data["entry"][0]["resource"].get("category"):
                    if cat_entry.get("coding"):
                        for coding_entry in cat_entry.get("coding"):
                            if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
                                value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                                if dbgap_fhir_dict.get("studyType") and value:
                                    dbgap_fhir_dict["studyType"] += f", {value}"
                                elif value:
                                    dbgap_fhir_dict["studyType"] = value
            # dataTypes
            dt_list = []
            if fhir_data["entry"][0]["resource"].get("extension"): 
                for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
                        for inner_ext_entry in ext_entry.get("extension"):
                            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                                for coding_entry in inner_ext_entry["valueCodeableConcept"].get("coding"):
                                    dt_list.append(coding_entry.get("code"))
            dbgap_fhir_dict["dataTypes"] = dt_list
            # phenotypeIndication
            if fhir_data["entry"][0]["resource"].get("focus"):
                for focus_entry in fhir_data["entry"][0]["resource"].get("focus"):
                    if focus_entry.get("coding"):
                        for coding_entry in focus_entry.get("coding"):
                            value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                            if dbgap_fhir_dict.get("phenotypeIndication") and value:
                                dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
                            elif value:
                                dbgap_fhir_dict["phenotypeIndication"] = value
            # numberOfParticipants
            if fhir_data["entry"][0]["resource"].get("extension"):
                for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
                        for inner_ext_entry in ext_entry.get("extension"):
                            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                                dbgap_fhir_dict["consentGroups.numberOfParticipants"] = inner_ext_entry["valueCount"].get("code")
    #         print("------------------------------------------------------")
    #         print("dbgap_fhir_dict")
    #         print(dbgap_fhir_dict)

    # Reconcile information and create final results
    consent_code = coalesce(snapshot_consent_code, terra_dict.get("consentGroups.consentCode"), dbgap_fhir_dict.get("consentGroups.consentCode"), dbgap_xml_dict.get("consentGroups.consentCode"), dbgap_study_api_dict.get("consentGroups.consentCode"))
    if consent_code:
        consent_code = consent_code.upper().replace("_", "-")
    else:
        consent_code = ""
    consortium = coalesce(duos_study_dict.get("consortium"), terra_dict.get("consortium"), dbgap_fhir_dict.get("consortium"), dbgap_xml_dict.get("consortium"), dbgap_study_api_dict.get("consortium"))
    dbGaPPhsID = coalesce(duos_study_dict.get("dbGaPPhsID"), dbgap_fhir_dict.get("dbGaPPhsID"), dbgap_xml_dict.get("dbGaPPhsID"), dbgap_study_api_dict.get("dbGaPPhsID"), terra_dict.get("dbGaPPhsID"))
    studyName = coalesce(duos_study_dict.get("studyName"), dbgap_fhir_dict.get("studyName"), dbgap_xml_dict.get("studyName"), dbgap_study_api_dict.get("studyName"), terra_dict.get("studyName"))
    if dbGaPPhsID and consent_code:
        study_consent = dbGaPPhsID + ":" + consent_code
        purl_doid = ds_consent_map.get(study_consent)
        if purl_doid:
            if not isinstance(purl_doid, list):
                purl_doid = [purl_doid]
        else:
            purl_doid = []
    else:
        purl_doid = []
    final_results_dict["snapshot_id"] = snapshot_id
    final_results_dict["snapshot_duos_id"] = duos_id
    final_results_dict["match_duos_id"] = match_duos_id
    final_results_dict["match_study_id"] = match_study_id
    if dbGaPPhsID and f" ({dbGaPPhsID})" not in final_results_dict["studyName"]:
        final_results_dict["studyName"] = studyName + f" ({dbGaPPhsID})"
    else:
        final_results_dict["studyName"] = studyName
    final_results_dict["studyType"] = coalesce(duos_study_dict.get("studyType"), dbgap_fhir_dict.get("studyType"), dbgap_xml_dict.get("studyType"), dbgap_study_api_dict.get("studyType"), terra_dict.get("studyType"))
    final_results_dict["studyDescription"] = format_description(coalesce(duos_study_dict.get("studyDescription"), dbgap_fhir_dict.get("studyDescription"), dbgap_xml_dict.get("studyDescription"), dbgap_study_api_dict.get("studyDescription"), terra_dict.get("studyDescription")))
    if final_results_dict["studyDescription"]:
        if "Platform: AnVIL" not in final_results_dict["studyDescription"]:
            final_results_dict["studyDescription"] = final_results_dict["studyDescription"] + "\nPlatform: AnVIL"
    else:
        final_results_dict["studyDescription"] = "Platform: AnVIL"
    final_results_dict["dataTypes"] = coalesce(duos_study_dict.get("dataTypes"), terra_dict.get("dataTypes"), dbgap_fhir_dict.get("dataTypes"), dbgap_xml_dict.get("dataTypes"), dbgap_study_api_dict.get("dataTypes"))
    final_results_dict["phenotypeIndication"] = coalesce(duos_study_dict.get("phenotypeIndication"), terra_dict.get("phenotypeIndication"), dbgap_fhir_dict.get("phenotypeIndication"), dbgap_xml_dict.get("phenotypeIndication"), dbgap_study_api_dict.get("phenotypeIndication"))
    final_results_dict["species"] = "Human"
    final_results_dict["piName"] = coalesce(duos_study_dict.get("piName"), dbgap_fhir_dict.get("piName"), dbgap_xml_dict.get("piName"), dbgap_study_api_dict.get("piName"), terra_dict.get("piName"), "None")
    final_results_dict["dataCustodianEmail"] = ["help@lists.anvilproject.org"]
    final_results_dict["publicVisibility"] = True
    final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if dbGaPPhsID else "I am NHGRI funded and I do not have a dbGaP PHS ID"
    final_results_dict["submittingToAnvil"] = True
    final_results_dict["dbGaPPhsID"] = dbGaPPhsID
    final_results_dict["dbGaPStudyRegistrationName"] = coalesce(duos_study_dict.get("dbGaPStudyRegistrationName"), dbgap_fhir_dict.get("dbGaPStudyRegistrationName"), dbgap_xml_dict.get("dbGaPStudyRegistrationName"), dbgap_study_api_dict.get("dbGaPStudyRegistrationName"), terra_dict.get("dbGaPStudyRegistrationName"))
    final_results_dict["embargoReleaseDate"] = coalesce(duos_study_dict.get("embargoReleaseDate"), dbgap_fhir_dict.get("embargoReleaseDate"), dbgap_xml_dict.get("embargoReleaseDate"), dbgap_study_api_dict.get("embargoReleaseDate"), terra_dict.get("embargoReleaseDate"))
    final_results_dict["sequencingCenter"] = None
    final_results_dict["piEmail"] = coalesce(duos_study_dict.get("piEmail"), dbgap_fhir_dict.get("piEmail"), dbgap_xml_dict.get("piEmail"), dbgap_study_api_dict.get("piEmail"), terra_dict.get("piEmail"))
    final_results_dict["piInstitution"] = coalesce(duos_study_dict.get("piInstitution"), dbgap_fhir_dict.get("piInstitution"), dbgap_xml_dict.get("piInstitution"), dbgap_study_api_dict.get("piInstitution"), terra_dict.get("piInstitution"))
    final_results_dict["nihGrantContractNumber"] = None
    final_results_dict["nihICsSupportingStudy"] = coalesce(duos_study_dict.get("nihICsSupportingStudy"), dbgap_fhir_dict.get("nihICsSupportingStudy"), dbgap_xml_dict.get("nihICsSupportingStudy"), dbgap_study_api_dict.get("nihICsSupportingStudy"), terra_dict.get("nihICsSupportingStudy"))
    final_results_dict["nihProgramOfficerName"] = coalesce(duos_study_dict.get("nihProgramOfficerName"), dbgap_fhir_dict.get("nihProgramOfficerName"), dbgap_xml_dict.get("nihProgramOfficerName"), dbgap_study_api_dict.get("nihProgramOfficerName"), terra_dict.get("nihProgramOfficerName"))
    final_results_dict["nihInstitutionCenterSubmission"] = "NHGRI"
    final_results_dict["nihInstitutionalCertificationFileName"] = None
    final_results_dict["nihGenomicProgramAdministratorName"] = coalesce(duos_study_dict.get("nihGenomicProgramAdministratorName"), dbgap_fhir_dict.get("nihGenomicProgramAdministratorName"), dbgap_xml_dict.get("nihGenomicProgramAdministratorName"), dbgap_study_api_dict.get("nihGenomicProgramAdministratorName"), terra_dict.get("nihGenomicProgramAdministratorName"))
    final_results_dict["multiCenterStudy"] = None
    final_results_dict["collaboratingSites"] = [consortium] if consortium else []
    final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = None
    final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = None
    final_results_dict["alternativeDataSharingPlan"] = False
    final_results_dict["alternativeDataSharingPlanReasons"] = []
    final_results_dict["alternativeDataSharingPlanExplanation"] = None
    final_results_dict["alternativeDataSharingPlanFileName"] = None
    final_results_dict["alternativeDataSharingPlanDataSubmitted"] = None
    final_results_dict["alternativeDataSharingPlanDataReleased"] = None
    final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = None
    final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = None
    final_results_dict["alternativeDataSharingPlanAccessManagement"] = None
    final_results_dict["consentGroups.consentGroupName"] = consent_group_name
    final_results_dict["consentGroups.accessManagement"] = access_management
    final_results_dict["consentGroups.numberOfParticipants"] = coalesce(terra_dict.get("consentGroups.numberOfParticipants"), dbgap_fhir_dict.get("consentGroups.numberOfParticipants"), dbgap_xml_dict.get("consentGroups.numberOfParticipants"), dbgap_study_api_dict.get("consentGroups.numberOfParticipants"), "0")
    final_results_dict["consentCode"] = consent_code
    final_results_dict["consentGroups.generalResearchUse"] = True if access_management == "controlled" and "GRU" in consent_code else False
    final_results_dict["consentGroups.hmb"] = True if access_management == "controlled" and "HMB" in consent_code else False
    if purl_doid:
        final_results_dict["consentGroups.diseaseSpecificUse"] = purl_doid
    else:
        final_results_dict["consentGroups.diseaseSpecificUse"] = []
    final_results_dict["consentGroups.gs"] = consent_code if access_management == "controlled" and "GS-" in consent_code else None
    final_results_dict["consentGroups.poa"] = True if access_management == "controlled" and "POA" in consent_code else False
    final_results_dict["consentGroups.nmds"] = True if access_management == "controlled" and "NMDS" in consent_code else False
    final_results_dict["consentGroups.gso"] = True if access_management == "controlled" and "GSO" in consent_code else False
    final_results_dict["consentGroups.pub"] = True if access_management == "controlled" and "PUB" in consent_code else False 
    final_results_dict["consentGroups.col"] = True if access_management == "controlled" and "COL-" in consent_code else False
    final_results_dict["consentGroups.irb"] = True if access_management == "controlled" and "IRB" in consent_code else False
    final_results_dict["consentGroups.npu"] = True if access_management == "controlled" and "NPU" in consent_code else False
    final_results_dict["consentGroups.otherPrimary"] = consent_code if (consent_code and access_management == "controlled" and not (final_results_dict["consentGroups.generalResearchUse"] or final_results_dict["consentGroups.hmb"] or final_results_dict["consentGroups.diseaseSpecificUse"] or final_results_dict["consentGroups.gs"] or final_results_dict["consentGroups.poa"] or final_results_dict["consentGroups.nmds"] or final_results_dict["consentGroups.gso"] or final_results_dict["consentGroups.pub"] or final_results_dict["consentGroups.col"] or final_results_dict["consentGroups.irb"] or final_results_dict["consentGroups.npu"])) else None
    final_results_dict["consentGroups.otherSecondary"] = None
    final_results_dict["consentGroups.mor"] = None
    final_results_dict["consentGroups.morDate"] = None
    final_results_dict["consentGroups.dataLocation"] = "TDR Location"
    final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
    final_results_dict["consentGroups.fileTypes.fileType"] = coalesce(terra_dict.get("consentGroups.fileTypes.fileType"), dbgap_fhir_dict.get("consentGroups.fileTypes.fileType"), dbgap_xml_dict.get("consentGroups.fileTypes.fileType"), dbgap_study_api_dict.get("consentGroups.fileTypes.fileType"))
    final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
    final_results_dict["consortium"] = consortium
    dataset_details_records.append(final_results_dict)
    
# Return results
#return dataset_details_records
            

In [None]:
# Inspection cell: show the current value of final_results_dict as the cell output.
final_results_dict

In [None]:
# Inspection cell: show the current value of dataset_lookup as the cell output.
dataset_lookup

In [None]:
# Inspection cell: show the current value of study_lookup as the cell output.
study_lookup

In [None]:

# Loop through new datasets API
    # Create base consent_group_name from dataset_name to use for comparison (see above regex)
    # Attempt to match the dataset based on the consent group name
        # If match found, stop looking, assign the "identifier" as the target DUOS ID and "study_id" as the target study ID
# Loop through study lookup
    # Attempt to match snapshot PHS ID to a PHS ID for a study
        # If match found, stop looking, and assign this as another target study ID
#     If DUO on Snapshot, use that for both Study and Dataset information
#     If dataset match, use that for both Study and Dataset information
#     If no dataset match, try study match. If study match, use that for Study information


# Pull information from existing DUOS registration (if listed).
# Populates duos_dict (the study registration) and duos_data_use_dict (the
# "dataUse" section of the dataset record, or {} when it is unavailable).
if duos_id:
    # Establish credentials
    creds, project = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    duos_headers = {"Authorization": f"Bearer {creds.token}"}

    # Pull existing DUOS study registration
    duos_dict = requests.get(
        url=f"https://consent.dsde-prod.broadinstitute.org/api/dataset/registration/{duos_id}",
        headers=duos_headers
    ).json()
#         print(duos_dict)

    # Pull dataset details from DUOS (to get data use info)
    duos_dataset_id = duos_dict["consentGroups"][0].get("datasetId")
    duos_data_use_dict = {}
    if duos_dataset_id:
        dataset_details = requests.get(
            url=f"https://consent.dsde-prod.broadinstitute.org/api/dataset/v2/{duos_dataset_id}",
            headers=duos_headers
        ).json()
        # Keep the {} default when the record has no "dataUse" (missing or null):
        # code further down calls .get() on duos_data_use_dict and would fail on None.
        duos_data_use_dict = dataset_details.get("dataUse") or {}
#         print(duos_data_use_dict)

# Pull information from DUOS

# Pull information from original workspace (if listed).
# Fills terra_dict from the workspace's library/tag attributes and, when no
# phs_id is set yet, takes it from a "dbGaP:" workspace tag.
if source_workspace:
    # Establish credentials
    creds, project = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)

    # Pull workspace attributes (up to three attempts). The holder starts out
    # empty so that, if every attempt fails, the mapping below is skipped
    # instead of raising NameError or reusing a response from an earlier run.
    ws_attributes = {}
    attempt_counter = 0
    while attempt_counter <= 2:
        try:
            ws_attributes = requests.get(
                url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{source_workspace}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
                headers={"Authorization": f"Bearer {creds.token}"}
            ).json()
            break
        except Exception:
            # Best-effort retry on any request/parse failure; unlike a bare
            # except this still lets KeyboardInterrupt/SystemExit through.
            sleep(5)
            attempt_counter += 1

    # Map to schema
    if ws_attributes.get("workspace"):
        ws_attrs = ws_attributes["workspace"]["attributes"]
        terra_dict["studyName"] = coalesce(ws_attrs.get("library:projectName"), source_workspace)
        terra_dict["studyType"] = ws_attrs.get("library:studyDesign")
        terra_dict["studyDescription"] = ws_attrs.get("description")
        if ws_attrs.get("library:dataCategory"):
            # Each item may itself be a comma-separated, quote-wrapped list; flatten it.
            terra_dict["dataTypes"] = [
                category.replace("'", "").strip()
                for item in ws_attrs["library:dataCategory"]["items"]
                for category in item.split(",")
            ]
        terra_dict["phenotypeIndication"] = ws_attrs.get("library:indication")
        terra_dict["species"] = "Homo sapiens"
        terra_dict["piName"] = ws_attrs.get("library:datasetOwner")
        terra_dict["dataCustodianEmail"] = [ws_attrs.get("library:contactEmail")]
        if ws_attrs.get("tag:tags"):
            for tag in ws_attrs["tag:tags"]["items"]:
                if "Consortium:" in tag:
                    terra_dict["consortium"] = tag.split(":")[1].strip()
                elif "dbGaP:" in tag:
                    tag_phs_id = format_phs_id(tag.split(":")[1].strip())
                    terra_dict["dbGaPPhsID"] = tag_phs_id
                    # Only fall back to the tag when no PHS ID is known yet.
                    if not phs_id:
                        phs_id = tag_phs_id
        terra_dict["consentGroups.consentCode"] = ws_attrs.get("library:dataUseRestriction")
        if ws_attrs.get("library:datatype"):
            terra_dict["consentGroups.fileTypes.fileType"] = ws_attrs["library:datatype"]["items"]
        if ws_attrs.get("library:numSubjects"):
            terra_dict["consentGroups.numberOfParticipants"] = ws_attrs["library:numSubjects"]
#         print("------------------------------------------------------")
#         print("terra_dict")
#         print(terra_dict)

# Pull information from dbGaP (if phs_id listed).
# Queries three dbGaP endpoints (study XML, Study API JSON, FHIR JSON) and maps
# what each returns into dbgap_xml_dict, dbgap_study_api_dict and dbgap_fhir_dict.
# Every response holder starts out empty so that, when all three attempts at an
# endpoint fail, its mapping is skipped instead of raising NameError or reusing
# a response that was parsed for a different study.
#     print("PHS ID for dbGaP: " + phs_id)
if phs_id:
    # Pull and parse XML
    phs_short = phs_id.replace("phs", "")
    dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
    xml_data = {}
    attempt_counter = 0
    while attempt_counter <= 2:
        try:
            response = requests.get(url=dbgap_url)
            xml_data = xmltodict.parse(response.text)
            break
        except Exception:
            # Best-effort retry; unlike a bare except this still lets
            # KeyboardInterrupt/SystemExit through.
            sleep(5)
            attempt_counter += 1
    study_uid = ""

    # Map to schema
    dbgapss_data = xml_data.get("dbgapss") or {}
    if dbgapss_data.get("Study"):
        # xmltodict yields a list for repeated elements and a dict for a single one.
        study_data = dbgapss_data["Study"]
        if isinstance(study_data, list):
            study_data = study_data[0]
        study_uid = study_data.get("@uid")
        study_info = study_data["StudyInfo"]
        authority = study_data["Authority"]
        dbgap_xml_dict["studyName"] = study_info.get("StudyNameEntrez")
        dbgap_xml_dict["studyDescription"] = study_info.get("Description")
        dbgap_xml_dict["dbGaPPhsID"] = phs_id
        dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_info.get("StudyNameEntrez")
        # A single <Person> is parsed as a dict; wrap it so the loop sees
        # person records rather than the dict's keys.
        person_entries = (authority.get("Persons") or {}).get("Person") or []
        if isinstance(person_entries, dict):
            person_entries = [person_entries]
        for ap_entry in person_entries:
            role = ap_entry.get("Role")
            organization = ap_entry.get("Organization")
            if role == "PI":
                dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                dbgap_xml_dict["piEmail"] = ap_entry.get("@email")
                dbgap_xml_dict["piInstitution"] = organization
            elif role == "PO" and organization == "NIH":
                dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
            elif role == "GPA" and organization == "NIH":
                dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
        # Same single-vs-repeated normalisation for <IC>.
        ic_entries = (authority.get("ICs") or {}).get("IC") or []
        if isinstance(ic_entries, dict):
            ic_entries = [ic_entries]
        ic_list = [ic_entry["@name"] for ic_entry in ic_entries]
        dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
        dbgap_xml_dict["consentGroups.numberOfParticipants"] = study_data.get("@num_participants")
        dbgap_xml_dict["embargoReleaseDate"] = (study_data.get("Policy") or {}).get("@pub-embargo")
#             print("------------------------------------------------------")
#             print("dbgap_xml_dict")
#             print(dbgap_xml_dict)

    # Pull and parse Study API JSON
    if study_uid:
        dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
        study_api_data = {}
        attempt_counter = 0
        while attempt_counter <= 2:
            try:
                response = requests.get(url=dbgap_study_url)
                study_api_data = json.loads(response.text)
                break
            except Exception:
                sleep(5)
                attempt_counter += 1

        # Map to schema (only when the call reported no error and returned a payload)
        study_api_payload = study_api_data.get("data")
        if study_api_data.get("error") is None and study_api_payload:
            dbgap_study_api_dict["studyName"] = study_api_payload.get("report_name")
            dbgap_study_api_dict["studyDescription"] = study_api_payload.get("description")
            dbgap_study_api_dict["phenotypeIndication"] = study_api_payload.get("primary_disease")
            dbgap_study_api_dict["studyType"] = study_api_payload.get("study_design")
            dbgap_study_api_dict["dbGaPPhsID"] = phs_id
            dbgap_study_api_dict["dbGaPStudyRegistrationName"] = study_api_payload.get("report_name")
            # "attribution" may be absent or null; treat that as no entries.
            for attr_entry in study_api_payload.get("attribution") or []:
                if attr_entry.get("title") == "Principal Investigator":
                    dbgap_study_api_dict["piName"] = attr_entry.get("name")
                    dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
                    break
#             print("------------------------------------------------------")
#             print("dbgap_study_api_dict")
#             print(dbgap_study_api_dict)

    # Pull and parse FHIR API JSON
    dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
    fhir_data = {}
    attempt_counter = 0
    while attempt_counter <= 2:
        try:
            response = requests.get(url=dbgap_fhir_url)
            fhir_data = json.loads(response.text)
            break
        except Exception:
            sleep(5)
            attempt_counter += 1

    # Map to schema
    if fhir_data.get("entry"):
        fhir_resource = fhir_data["entry"][0]["resource"]
        fhir_extensions = fhir_resource.get("extension") or []
        dbgap_fhir_dict["studyName"] = fhir_resource.get("title")
        dbgap_fhir_dict["studyDescription"] = fhir_resource.get("description")
        dbgap_fhir_dict["dbGaPPhsID"] = phs_id
        dbgap_fhir_dict["dbGaPStudyRegistrationName"] = fhir_resource.get("title")
        # NIH ICs: prefer the code after "Organization/" in the sponsor reference,
        # otherwise fall back to the sponsor display name. Nothing is recorded
        # when the sponsor carries neither, so other sources can supply the value.
        sponsor = fhir_resource.get("sponsor") or {}
        sponsor_reference = sponsor.get("reference") or ""
        if "Organization/" in sponsor_reference:
            dbgap_fhir_dict["nihICsSupportingStudy"] = [sponsor_reference.split("Organization/", 1)[1]]
        else:
            ic_display = sponsor.get("display")
            if ic_display == "National Human Genome Research Institute":
                dbgap_fhir_dict["nihICsSupportingStudy"] = ["NHGRI"]
            elif ic_display:
                dbgap_fhir_dict["nihICsSupportingStudy"] = [ic_display]
        # studyType: comma-join every study-design coding (display preferred over code)
        for cat_entry in fhir_resource.get("category") or []:
            for coding_entry in cat_entry.get("coding") or []:
                if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
                    value = coding_entry.get("display") or coding_entry.get("code")
                    if dbgap_fhir_dict.get("studyType") and value:
                        dbgap_fhir_dict["studyType"] += f", {value}"
                    elif value:
                        dbgap_fhir_dict["studyType"] = value
        # dataTypes
        dt_list = []
        for ext_entry in fhir_extensions:
            if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
                for inner_ext_entry in ext_entry.get("extension") or []:
                    if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                        for coding_entry in (inner_ext_entry.get("valueCodeableConcept") or {}).get("coding") or []:
                            dt_list.append(coding_entry.get("code"))
        dbgap_fhir_dict["dataTypes"] = dt_list
        # phenotypeIndication: comma-join every focus coding (display preferred over code)
        for focus_entry in fhir_resource.get("focus") or []:
            for coding_entry in focus_entry.get("coding") or []:
                value = coding_entry.get("display") or coding_entry.get("code")
                if dbgap_fhir_dict.get("phenotypeIndication") and value:
                    dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
                elif value:
                    dbgap_fhir_dict["phenotypeIndication"] = value
        # numberOfParticipants
        for ext_entry in fhir_extensions:
            if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
                for inner_ext_entry in ext_entry.get("extension") or []:
                    if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                        dbgap_fhir_dict["consentGroups.numberOfParticipants"] = (inner_ext_entry.get("valueCount") or {}).get("code")
#         print("------------------------------------------------------")
#         print("dbgap_fhir_dict")
#         print(dbgap_fhir_dict)

# Reconcile information and create final results.
# For each field the first usable value (per coalesce) across the collected
# source dicts wins; the order of the arguments is the order of precedence.
consent_code = coalesce(snapshot_consent_code, terra_dict.get("consentGroups.consentCode"), dbgap_fhir_dict.get("consentGroups.consentCode"), dbgap_xml_dict.get("consentGroups.consentCode"), dbgap_study_api_dict.get("consentGroups.consentCode"))
# Normalise to upper case with hyphens; fall back to "" so later substring tests are safe.
consent_code = consent_code.upper().replace("_", "-") if consent_code else ""
consortium = coalesce(terra_dict.get("consortium"), dbgap_fhir_dict.get("consortium"), dbgap_xml_dict.get("consortium"), dbgap_study_api_dict.get("consortium"))
dbGaPPhsID = coalesce(dbgap_fhir_dict.get("dbGaPPhsID"), dbgap_xml_dict.get("dbGaPPhsID"), dbgap_study_api_dict.get("dbGaPPhsID"), terra_dict.get("dbGaPPhsID"))
studyName = coalesce(dbgap_fhir_dict.get("studyName"), dbgap_xml_dict.get("studyName"), dbgap_study_api_dict.get("studyName"), terra_dict.get("studyName"))
# Look up the ds_consent_map entry keyed by "<phs id>:<consent code>"; purl_doid is always a list.
purl_doid = []
if dbGaPPhsID and consent_code:
    study_consent = dbGaPPhsID + ":" + consent_code
    mapped_doid = ds_consent_map.get(study_consent)
    if mapped_doid:
        purl_doid = mapped_doid if isinstance(mapped_doid, list) else [mapped_doid]
final_results_dict["snapshot_id"] = snapshot_id
final_results_dict["duos_id"] = duos_id
# Strip the trailing "_<8 digits>_ANV<n>_<12 digits>" suffix from the snapshot
# name; keep the full name when it does not follow that pattern.
try:
    consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', snapshot_name).group(1)
except (AttributeError, TypeError):
    # AttributeError: no match (re.search returned None).
    # TypeError: snapshot_name is not a string (e.g. None).
    consent_group_name = snapshot_name
if duos_id:
    final_results_dict["studyName"] = duos_dict.get("studyName")
    final_results_dict["studyType"] = duos_dict.get("studyType")
    final_results_dict["studyDescription"] = duos_dict.get("studyDescription")
    final_results_dict["dataTypes"] = duos_dict.get("dataTypes")
    final_results_dict["phenotypeIndication"] = duos_dict.get("phenotypeIndication")
    final_results_dict["species"] = duos_dict.get("species")
    final_results_dict["piName"] = duos_dict.get("piName")
    final_results_dict["dataCustodianEmail"] = duos_dict.get("dataCustodianEmail")
    final_results_dict["publicVisibility"] = duos_dict.get("publicVisibility")
    final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if 'already' in duos_dict.get("nihAnvilUse").lower() else "I am NHGRI funded and I do not have a dbGaP PHS ID"
    final_results_dict["submittingToAnvil"] = duos_dict.get("submittingToAnvil")
    final_results_dict["dbGaPPhsID"] = duos_dict.get("dbGaPPhsID")
    final_results_dict["dbGaPStudyRegistrationName"] = duos_dict.get("dbGaPStudyRegistrationName")
    final_results_dict["embargoReleaseDate"] = duos_dict.get("embargoReleaseDate")
    final_results_dict["sequencingCenter"] = duos_dict.get("sequencingCenter")
    final_results_dict["piEmail"] = duos_dict.get("piEmail")
    final_results_dict["piInstitution"] = duos_dict.get("piInstitution")
    final_results_dict["nihGrantContractNumber"] = duos_dict.get("nihGrantContractNumber")
    final_results_dict["nihICsSupportingStudy"] = duos_dict.get("nihICsSupportingStudy")
    final_results_dict["nihProgramOfficerName"] = duos_dict.get("nihProgramOfficerName")
    final_results_dict["nihInstitutionCenterSubmission"] = duos_dict.get("nihInstitutionCenterSubmission")
    final_results_dict["nihInstitutionalCertificationFileName"] = duos_dict.get("nihInstitutionalCertificationFileName")
    final_results_dict["nihGenomicProgramAdministratorName"] = duos_dict.get("nihGenomicProgramAdministratorName")
    final_results_dict["multiCenterStudy"] = duos_dict.get("multiCenterStudy")
    final_results_dict["collaboratingSites"] = duos_dict.get("collaboratingSites")
    final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSR")
    final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation")
    final_results_dict["alternativeDataSharingPlan"] = duos_dict.get("alternativeDataSharingPlan")
    final_results_dict["alternativeDataSharingPlanReasons"] = duos_dict.get("alternativeDataSharingPlanReasons")
    final_results_dict["alternativeDataSharingPlanExplanation"] = duos_dict.get("alternativeDataSharingPlanExplanation")
    final_results_dict["alternativeDataSharingPlanFileName"] = duos_dict.get("alternativeDataSharingPlanFileName")
    final_results_dict["alternativeDataSharingPlanDataSubmitted"] = duos_dict.get("alternativeDataSharingPlanDataSubmitted")
    final_results_dict["alternativeDataSharingPlanDataReleased"] = duos_dict.get("alternativeDataSharingPlanDataReleased")
    final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = duos_dict.get("alternativeDataSharingPlanTargetDeliveryDate")
    final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = duos_dict.get("alternativeDataSharingPlanTargetPublicReleaseDate")
    final_results_dict["alternativeDataSharingPlanAccessManagement"] = duos_dict.get("alternativeDataSharingPlanAccessManagement")
    final_results_dict["consentGroups.consentGroupName"] = consent_group_name
    final_results_dict["consentGroups.accessManagement"] = access_management
    final_results_dict["consentGroups.numberOfParticipants"] = duos_dict["consentGroups"][0].get("numberOfParticipants")
    final_results_dict["consentCode"] = consent_code
    final_results_dict["consentGroups.generalResearchUse"] = coalesce(duos_dict["consentGroups"][0].get("generalResearchUse"), duos_data_use_dict.get("generalUse"), False)
    final_results_dict["consentGroups.hmb"] = coalesce(duos_dict["consentGroups"][0].get("hmb"), duos_data_use_dict.get("hmbResearch"), False)
    final_results_dict["consentGroups.diseaseSpecificUse"] = coalesce(duos_dict["consentGroups"][0].get("diseaseSpecificUse"), duos_data_use_dict.get("diseaseRestrictions"), [])
    final_results_dict["consentGroups.gs"] = coalesce(duos_dict["consentGroups"][0].get("gs"), duos_data_use_dict.get("geographicalRestrictions"))
    final_results_dict["consentGroups.poa"] = coalesce(duos_dict["consentGroups"][0].get("poa"), duos_data_use_dict.get("populationOriginsAncestry"), False)
    final_results_dict["consentGroups.nmds"] = coalesce(duos_dict["consentGroups"][0].get("nmds"), False)
    final_results_dict["consentGroups.gso"] = coalesce(duos_dict["consentGroups"][0].get("gso"), duos_data_use_dict.get("geneticStudiesOnly"), False)
    final_results_dict["consentGroups.pub"] = coalesce(duos_dict["consentGroups"][0].get("pub"), duos_data_use_dict.get("publicationResults"), False)
    final_results_dict["consentGroups.col"] = coalesce(duos_dict["consentGroups"][0].get("col"), duos_data_use_dict.get("collaboratorRequired"), False)
    final_results_dict["consentGroups.irb"] = coalesce(duos_dict["consentGroups"][0].get("irb"), duos_data_use_dict.get("ethicsApprovalRequired"), False)
    final_results_dict["consentGroups.npu"] = coalesce(duos_dict["consentGroups"][0].get("npu"), False)
    final_results_dict["consentGroups.otherPrimary"] = duos_dict["consentGroups"][0].get("otherPrimary")
    final_results_dict["consentGroups.otherSecondary"] = duos_dict["consentGroups"][0].get("otherSecondary")
    final_results_dict["consentGroups.mor"] = duos_dict["consentGroups"][0].get("mor")
    final_results_dict["consentGroups.morDate"] = duos_dict["consentGroups"][0].get("morDate")
    final_results_dict["consentGroups.dataLocation"] = "TDR Location"
    final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
    if duos_dict["consentGroups"][0]["fileTypes"] and duos_dict["consentGroups"][0]["fileTypes"].get("fileType"):
        final_results_dict["consentGroups.fileTypes.fileType"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("fileType")
    else:
        final_results_dict["consentGroups.fileTypes.fileType"] = None
    if duos_dict["consentGroups"][0]["fileTypes"] and duos_dict["consentGroups"][0]["fileTypes"].get("functionalEquivalence"):
        final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("functionalEquivalence")
    else:
        final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
    final_results_dict["consortium"] = consortium
else:
    if dbGaPPhsID:
        final_results_dict["studyName"] = studyName + f" ({dbGaPPhsID})"
    else:
        final_results_dict["studyName"] = studyName
    final_results_dict["studyType"] = coalesce(dbgap_fhir_dict.get("studyType"), dbgap_xml_dict.get("studyType"), dbgap_study_api_dict.get("studyType"), terra_dict.get("studyType"))
    final_results_dict["studyDescription"] = format_description(coalesce(dbgap_fhir_dict.get("studyDescription"), dbgap_xml_dict.get("studyDescription"), dbgap_study_api_dict.get("studyDescription"), terra_dict.get("studyDescription")))
    if final_results_dict["studyDescription"]:
        final_results_dict["studyDescription"] = final_results_dict["studyDescription"] + "\nPlatform: AnVIL"
    else:
        final_results_dict["studyDescription"] = "Platform: AnVIL"
    final_results_dict["dataTypes"] = coalesce(terra_dict.get("dataTypes"), dbgap_fhir_dict.get("dataTypes"), dbgap_xml_dict.get("dataTypes"), dbgap_study_api_dict.get("dataTypes"))
    final_results_dict["phenotypeIndication"] = coalesce(terra_dict.get("phenotypeIndication"), dbgap_fhir_dict.get("phenotypeIndication"), dbgap_xml_dict.get("phenotypeIndication"), dbgap_study_api_dict.get("phenotypeIndication"))
    final_results_dict["species"] = "Human"
    final_results_dict["piName"] = coalesce(dbgap_fhir_dict.get("piName"), dbgap_xml_dict.get("piName"), dbgap_study_api_dict.get("piName"), terra_dict.get("piName"), "None")
    final_results_dict["dataCustodianEmail"] = ["help@lists.anvilproject.org"]
    final_results_dict["publicVisibility"] = True
    final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if dbGaPPhsID else "I am NHGRI funded and I do not have a dbGaP PHS ID"
    final_results_dict["submittingToAnvil"] = True
    final_results_dict["dbGaPPhsID"] = dbGaPPhsID
    final_results_dict["dbGaPStudyRegistrationName"] = coalesce(dbgap_fhir_dict.get("dbGaPStudyRegistrationName"), dbgap_xml_dict.get("dbGaPStudyRegistrationName"), dbgap_study_api_dict.get("dbGaPStudyRegistrationName"), terra_dict.get("dbGaPStudyRegistrationName"))
    final_results_dict["embargoReleaseDate"] = coalesce(dbgap_fhir_dict.get("embargoReleaseDate"), dbgap_xml_dict.get("embargoReleaseDate"), dbgap_study_api_dict.get("embargoReleaseDate"), terra_dict.get("embargoReleaseDate"))
    final_results_dict["sequencingCenter"] = None
    final_results_dict["piEmail"] = coalesce(dbgap_fhir_dict.get("piEmail"), dbgap_xml_dict.get("piEmail"), dbgap_study_api_dict.get("piEmail"), terra_dict.get("piEmail"))
    final_results_dict["piInstitution"] = coalesce(dbgap_fhir_dict.get("piInstitution"), dbgap_xml_dict.get("piInstitution"), dbgap_study_api_dict.get("piInstitution"), terra_dict.get("piInstitution"))
    final_results_dict["nihGrantContractNumber"] = None
    final_results_dict["nihICsSupportingStudy"] = coalesce(dbgap_fhir_dict.get("nihICsSupportingStudy"), dbgap_xml_dict.get("nihICsSupportingStudy"), dbgap_study_api_dict.get("nihICsSupportingStudy"), terra_dict.get("nihICsSupportingStudy"))
    final_results_dict["nihProgramOfficerName"] = coalesce(dbgap_fhir_dict.get("nihProgramOfficerName"), dbgap_xml_dict.get("nihProgramOfficerName"), dbgap_study_api_dict.get("nihProgramOfficerName"), terra_dict.get("nihProgramOfficerName"))
    final_results_dict["nihInstitutionCenterSubmission"] = "NHGRI"
    final_results_dict["nihInstitutionalCertificationFileName"] = None
    final_results_dict["nihGenomicProgramAdministratorName"] = coalesce(dbgap_fhir_dict.get("nihGenomicProgramAdministratorName"), dbgap_xml_dict.get("nihGenomicProgramAdministratorName"), dbgap_study_api_dict.get("nihGenomicProgramAdministratorName"), terra_dict.get("nihGenomicProgramAdministratorName"))
    final_results_dict["multiCenterStudy"] = None
    final_results_dict["collaboratingSites"] = [consortium] if consortium else []
    final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = None
    final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = None
    final_results_dict["alternativeDataSharingPlan"] = False
    final_results_dict["alternativeDataSharingPlanReasons"] = []
    final_results_dict["alternativeDataSharingPlanExplanation"] = None
    final_results_dict["alternativeDataSharingPlanFileName"] = None
    final_results_dict["alternativeDataSharingPlanDataSubmitted"] = None
    final_results_dict["alternativeDataSharingPlanDataReleased"] = None
    final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = None
    final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = None
    final_results_dict["alternativeDataSharingPlanAccessManagement"] = None
    final_results_dict["consentGroups.consentGroupName"] = consent_group_name
    final_results_dict["consentGroups.accessManagement"] = access_management
    final_results_dict["consentGroups.numberOfParticipants"] = coalesce(terra_dict.get("consentGroups.numberOfParticipants"), dbgap_fhir_dict.get("consentGroups.numberOfParticipants"), dbgap_xml_dict.get("consentGroups.numberOfParticipants"), dbgap_study_api_dict.get("consentGroups.numberOfParticipants"), "0")
    final_results_dict["consentCode"] = consent_code
    final_results_dict["consentGroups.generalResearchUse"] = True if access_management == "controlled" and "GRU" in consent_code else False
    final_results_dict["consentGroups.hmb"] = True if access_management == "controlled" and "HMB" in consent_code else False
    if purl_doid:
        final_results_dict["consentGroups.diseaseSpecificUse"] = purl_doid
    else:
        final_results_dict["consentGroups.diseaseSpecificUse"] = []
    final_results_dict["consentGroups.gs"] = consent_code if access_management == "controlled" and "GS-" in consent_code else None
    final_results_dict["consentGroups.poa"] = True if access_management == "controlled" and "POA" in consent_code else False
    final_results_dict["consentGroups.nmds"] = True if access_management == "controlled" and "NMDS" in consent_code else False
    final_results_dict["consentGroups.gso"] = True if access_management == "controlled" and "GSO" in consent_code else False
    final_results_dict["consentGroups.pub"] = True if access_management == "controlled" and "PUB" in consent_code else False 
    final_results_dict["consentGroups.col"] = True if access_management == "controlled" and "COL-" in consent_code else False
    final_results_dict["consentGroups.irb"] = True if access_management == "controlled" and "IRB" in consent_code else False
    final_results_dict["consentGroups.npu"] = True if access_management == "controlled" and "NPU" in consent_code else False
    final_results_dict["consentGroups.otherPrimary"] = consent_code if (consent_code and access_management == "controlled" and not (final_results_dict["consentGroups.generalResearchUse"] or final_results_dict["consentGroups.hmb"] or final_results_dict["consentGroups.diseaseSpecificUse"] or final_results_dict["consentGroups.gs"] or final_results_dict["consentGroups.poa"] or final_results_dict["consentGroups.nmds"] or final_results_dict["consentGroups.gso"] or final_results_dict["consentGroups.pub"] or final_results_dict["consentGroups.col"] or final_results_dict["consentGroups.irb"] or final_results_dict["consentGroups.npu"])) else None
    final_results_dict["consentGroups.otherSecondary"] = None
    final_results_dict["consentGroups.mor"] = None
    final_results_dict["consentGroups.morDate"] = None
    final_results_dict["consentGroups.dataLocation"] = "TDR Location"
    final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
    final_results_dict["consentGroups.fileTypes.fileType"] = coalesce(terra_dict.get("consentGroups.fileTypes.fileType"), dbgap_fhir_dict.get("consentGroups.fileTypes.fileType"), dbgap_xml_dict.get("consentGroups.fileTypes.fileType"), dbgap_study_api_dict.get("consentGroups.fileTypes.fileType"))
    final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
    final_results_dict["consortium"] = consortium

# Return results
return final_results_dict

# Step 2: Load Reviewed Metadata into DUOS

## Final Code

In [None]:
#############################################
## Functions
#############################################

def format_list(input_list, min_items):
    """Coerce a list-like cell value into a Python list.

    Args:
        input_list: A list, a string holding a Python literal such as
            "['a', 'b']", or any other value.
        min_items: Integer count of "Unknown" placeholders to return when
            the input is empty.

    Returns:
        list:
            * ``input_list`` itself when it is a non-empty list.
            * The result of re-processing the parsed value when
              ``input_list`` is a string containing a Python literal.
            * A one-item list holding the stripped text when the string is
              not a valid Python literal (plain text such as ``WGS``).
            * ``min_items`` copies of "Unknown" when the input is empty or
              blank (an empty list when ``min_items`` is not positive).
            * An empty list for any other non-empty value.
    """
    # Empty input (None, "", [], ...) is padded with placeholder entries
    if not input_list:
        return ["Unknown"] * min_items if min_items > 0 else []
    if isinstance(input_list, list):
        return input_list
    if isinstance(input_list, str):
        text = input_list.strip()
        # A whitespace-only string carries no values; treat it like an empty input
        if not text:
            return format_list(None, min_items)
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError):
            # Not a Python literal: keep the plain text as a single list item
            # instead of raising and failing the whole row
            return [text]
        # Re-run on the parsed value so "[]" is padded and nested quoting is unwrapped
        return format_list(parsed, min_items)
    # Non-list, non-string values (numbers, tuples, dicts, ...) yield no items
    return []
    
def format_file_types(ft_list, fe):
    """Build a list of file type dicts from a list-like file type value.

    Args:
        ft_list: File types, in any form accepted by ``format_list``.
        fe: Functional equivalence value applied to every file type;
            "Unknown" is substituted when it is empty.

    Returns:
        list: One ``{"fileType": ..., "functionalEquivalence": ...}`` dict
        per file type, or an empty list when ``ft_list`` is empty.
    """
    # Nothing to format when no file types were supplied
    if not ft_list:
        return []
    equivalence = fe if fe else "Unknown"
    return [
        {"fileType": file_type, "functionalEquivalence": equivalence}
        for file_type in format_list(ft_list, 0)
    ]
    
def upload_to_duos(input_file, token, env, dac_id, study_upload_list):
    
    # Determine the target URL from the env variable
    if env == "prod":
        url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        url = "https://consent.dsde-dev.broadinstitute.org"
    
    # Pull down specified file from the cloud
    results_log = []
    print(f"Downloading input file {input_file}...")
    try:
        input_df = pd.read_csv(input_file, delimiter = "\t", encoding='unicode_escape')
        input_df = input_df.astype(object).where(pd.notnull(input_df),None)
        input_df.fillna("",inplace=True)
        input_dict = input_df.to_dict(orient="records")
        results_log.append(["Input File Download", "Succeeded", ""])
    except Exception as e:
        msg = f"Error downloading input file ({input_file}): {str(e)}"
        results_log.append(["Input File Download", "Failed", msg])
        print(msg)
        return results_log

    # Pull a list of existing datasets and studies from DUOS and build lookup dicts
    print("Building study and dataset lookup dicts from DUOS...")
    try:
        datasets = requests.get(
            url=f"{url}/api/dataset/v2?asCustodian=false",
            headers={"Authorization": f"Bearer {token}"}
        ).json()
        study_lookup = {}
        full_dataset_lookup = {}
        dataset_lookup = {}
        for dataset_entry in datasets:
            base_name = ""
            dataset_name = ""
            dataset_id = dataset_entry.get("dataSetId")
            study_name = ""
            study_id = None
            if dataset_entry.get("name"):
                try:
                    base_name =  dataset_entry.get("name")
                    dataset_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', base_name).group(1)
                except:
                    base_name =  dataset_entry.get("name")
                    dataset_name = dataset_entry.get("name")
            anvil_dataset = False
            if dataset_entry.get("study"):
                if dataset_entry["study"].get("name"):
                    study_name = dataset_entry["study"].get("name")
                    study_id = dataset_entry["study"].get("studyId")
                if dataset_entry["study"].get("description"):
                    if "Platform: AnVIL" in dataset_entry["study"].get("description"):
                        anvil_dataset = True
            if study_name and study_name not in study_lookup.keys():
                study_lookup[study_name] = study_id
            if dataset_name:
                full_dataset_lookup[dataset_name] = {
                    "id": dataset_id,
                    "duos_identifier": dataset_entry.get("datasetIdentifier"),
                    "name": base_name,
                    "create_date": dataset_entry.get("createDate"),
                    "study_name": study_name,
                    "study_id": study_id,
                    "anvil_dataset": anvil_dataset,
                    "data_use": dataset_entry.get("dataUse")
                }
        for key, val in full_dataset_lookup.items():
            if val["anvil_dataset"] == True:
                dataset_lookup[key] = val
        results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Succeeded", ""])
    except Exception as e:
        msg = f"Error building study and dataset lookups: {str(e)}"
        results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Failed", msg])
        print(msg)
        return results_log
    
    # Parse and build DUOS schema for inputted file
    print("Parsing input file and formatting into DUOS schema...")
    try:
        # Determine data submitter id
        response = requests.get(
            url=f"{url}/api/user/me",
            headers={"Authorization": f"Bearer {token}"}
        ).json()
        data_submitter_id = response["userId"]
        # Build dictionary for upload
        upload_dict = {}
        consent_mismatch_list = []
        existing_dataset_cnt = 0
        new_dataset_cnt = 0
        for input_entry in input_dict:
            snapshot_id = input_entry["snapshot_id"]
            study_name = input_entry["studyName"]
            try:
                base_name = input_entry["consentGroups.consentGroupName"]
                consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', base_name).group(1)
            except:
                consent_group_name = input_entry["consentGroups.consentGroupName"]
            access_type = input_entry["consentGroups.accessManagement"]
            study_id = study_lookup.get(study_name)
            dataset = dataset_lookup.get(consent_group_name)
            dataset_id = ""
            dataset_id_str = ""
            if dataset:
                dataset_id_from_dataset = dataset["id"]
                study_id_from_dataset = dataset["study_id"]
                gru_from_dataset = dataset["data_use"].get("generalUse")
                gru_from_input = input_entry["consentGroups.generalResearchUse"]
                hmb_from_dataset = dataset["data_use"].get("hmbResearch")
                hmb_from_input = input_entry["consentGroups.hmb"]
                ds_from_dataset = dataset["data_use"].get("diseaseRestrictions")
                ds_from_input = format_list(input_entry["consentGroups.diseaseSpecificUse"], 0)
                if study_id_from_dataset == study_id:
                    dataset_id = dataset_id_from_dataset
                    dataset_id_str = f" ({dataset_id})"
                else:
                    dataset_id = None   
                print(f"Parsing and formatting metadata for snapshot {snapshot_id} from the input file. Target study is: {study_name}. Target consent group is: {consent_group_name}{dataset_id_str}")
                if input_entry["consentGroups.accessManagement"] == "controlled" and (gru_from_dataset != gru_from_input or hmb_from_dataset != hmb_from_input or ds_from_dataset != ds_from_input):
                    print("\tERROR: Mismatching consent information in existing DUOS record vs. proposed new record. Update the consent in the new record to match OR update the consent group name in the new record to create a new DUOS record.")
                    consent_mismatch_list.append(snapshot_id)
            else:
                print(f"Parsing and formatting metadata for snapshot {snapshot_id} from the input file. Target study is: {study_name}. Target consent group is: {consent_group_name}")
            # If this is an existing dataset in the specified existing study, provide limited consent group information (for updates only)
            if study_id and dataset_id:
                existing_dataset_cnt += 1
                consent_group_dict = {
                            "consentGroupName": consent_group_name,
                            "datasetId": dataset_id,
                            "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                            "dataLocation": input_entry["consentGroups.dataLocation"],
                            "url": input_entry["consentGroups.url"],
                            "fileTypes": []
                            #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                    }
            # If this is a new dataset that is open access, provide limited consent group information
            elif access_type == "open":
                new_dataset_cnt += 1
                consent_group_dict = {
                            "consentGroupName": consent_group_name,
                            "accessManagement": access_type,
                            "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                            "dataLocation": input_entry["consentGroups.dataLocation"],
                            "url": input_entry["consentGroups.url"],
                            "fileTypes": []
                            #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                    }
            # If this is a new dataset that is NOT open access, provide the full consent group information
            else:
                new_dataset_cnt += 1
                consent_group_dict = {
                            "consentGroupName": consent_group_name,
                            "dataAccessCommitteeId": dac_id,
                            "accessManagement": access_type,
                            "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                            "generalResearchUse": input_entry["consentGroups.generalResearchUse"],
                            "hmb": input_entry["consentGroups.hmb"],
                            "diseaseSpecificUse": format_list(input_entry["consentGroups.diseaseSpecificUse"], 0),
                            "gs": input_entry["consentGroups.gs"],
                            "poa": input_entry["consentGroups.poa"],
                            "nmds": input_entry["consentGroups.nmds"],
                            "gso": input_entry["consentGroups.gso"],
                            "pub": input_entry["consentGroups.pub"],
                            "col": input_entry["consentGroups.col"],
                            "irb": input_entry["consentGroups.irb"],
                            "npu": input_entry["consentGroups.npu"],
                            "otherPrimary": input_entry["consentGroups.otherPrimary"],
                            #"otherSecondary": input_entry["consentGroups.otherSecondary"], --> Excluding for now, per JL's request
                            #"mor": input_entry["consentGroups.mor"], --> Date formatting validation for morDate, exclude for now
                            #"morDate": input_entry["consentGroups.morDate"], --> Date formatting validation, exclude for now
                            "dataLocation": input_entry["consentGroups.dataLocation"],
                            "url": input_entry["consentGroups.url"],
                            "fileTypes": []
                            #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                    }
            study_dict = {}
            consent_group_list = []
            # If the study associated with the record is not already in the upload dictionary, create a new study dict and append the consent group dict
            if study_name not in upload_dict.keys():
                consent_group_list.append(consent_group_dict)
                study_dict = {
                    "studyName": study_name,
                    #"studyType": input_entry["studyType"], --> Enumeration, exclude for now
                    "studyDescription": input_entry["studyDescription"],
                    "dataTypes": format_list(input_entry["dataTypes"], 1),
                    "phenotypeIndication": input_entry["phenotypeIndication"],
                    "species": input_entry["species"],
                    "piName": input_entry["piName"] if input_entry["piName"] else "NA",
                    "dataSubmitterUserId": data_submitter_id,
                    "dataCustodianEmail": format_list(input_entry["dataCustodianEmail"], 0),
                    "publicVisibility": input_entry["publicVisibility"],
                    "nihAnvilUse": input_entry["nihAnvilUse"],
                    "submittingToAnvil": input_entry["submittingToAnvil"],
                    "dbGaPPhsID": input_entry["dbGaPPhsID"],
                    "dbGaPStudyRegistrationName": input_entry["studyName"],
                    #"embargoReleaseDate": input_entry["embargoReleaseDate"], --> Date formatting validation, exclude for now
                    "sequencingCenter": input_entry["sequencingCenter"],
                    "piEmail": input_entry["piEmail"],
                    #"piInstitution": input_entry["piInstitution"], --> Integer ID for registered institutions, exclude for now
                    "piInstitution": 0,
                    "nihGrantContractNumber": "Unknown", # Required currently
                    "nihICsSupportingStudy": format_list(input_entry["nihICsSupportingStudy"], 0),
                    "nihProgramOfficerName": input_entry["nihProgramOfficerName"],
                    "nihInstitutionCenterSubmission": input_entry["nihInstitutionCenterSubmission"],
                    "nihInstitutionalCertificationFileName": input_entry["nihInstitutionalCertificationFileName"],
                    "nihGenomicProgramAdministratorName": input_entry["nihGenomicProgramAdministratorName"],
                    "collaboratingSites": format_list(input_entry["collaboratingSites"], 0),
                    "alternativeDataSharingPlan": input_entry["alternativeDataSharingPlan"],
                    "alternativeDataSharingPlanExplanation": input_entry["alternativeDataSharingPlanExplanation"],
                    "alternativeDataSharingPlanReasons": ["Other"] if input_entry["alternativeDataSharingPlan"] == True and input_entry["alternativeDataSharingPlanReasons"] == "[]" else format_list(input_entry["alternativeDataSharingPlanReasons"], 0), 
                    "consentGroups": consent_group_list
                }
                upload_dict[study_name] = study_dict
            # If the study is already in the upload dictionary, create an updated study dict and extend its list of consent groups
            else:
                for consent_group in upload_dict[study_name]["consentGroups"]:
                    if consent_group["consentGroupName"] != consent_group_dict["consentGroupName"]:
                        consent_group_list.append(consent_group)
                consent_group_list.append(consent_group_dict)
                study_dict = {
                    "studyName": study_name,
                    #"studyType": upload_dict[study_name]["studyType"], --> Enumeration, exclude for now
                    "studyDescription": upload_dict[study_name]["studyDescription"],
                    "dataTypes": upload_dict[study_name]["dataTypes"],
                    "phenotypeIndication": upload_dict[study_name]["phenotypeIndication"],
                    "species": upload_dict[study_name]["species"],
                    "piName": upload_dict[study_name]["piName"] if upload_dict[study_name]["piName"] else "NA",
                    "dataSubmitterUserId": upload_dict[study_name]["dataSubmitterUserId"],
                    "dataCustodianEmail": upload_dict[study_name]["dataCustodianEmail"],
                    "publicVisibility": upload_dict[study_name]["publicVisibility"],
                    "nihAnvilUse": upload_dict[study_name]["nihAnvilUse"],
                    "submittingToAnvil": upload_dict[study_name]["submittingToAnvil"],
                    "dbGaPPhsID": upload_dict[study_name]["dbGaPPhsID"],
                    "dbGaPStudyRegistrationName": upload_dict[study_name]["studyName"],
                    #"embargoReleaseDate": upload_dict[study_name]["embargoReleaseDate"], --> Date formatting validation, exclude for now
                    "sequencingCenter": upload_dict[study_name]["sequencingCenter"],
                    "piEmail": upload_dict[study_name]["piEmail"],
                    #"piInstitution": upload_dict[study_name]["piInstitution"], --> Integer ID for registered institutions, exclude for now
                    "piInstitution": upload_dict[study_name]["piInstitution"],
                    "nihGrantContractNumber": upload_dict[study_name]["nihGrantContractNumber"],
                    "nihICsSupportingStudy": upload_dict[study_name]["nihICsSupportingStudy"],
                    "nihProgramOfficerName": upload_dict[study_name]["nihProgramOfficerName"],
                    "nihInstitutionCenterSubmission": upload_dict[study_name]["nihInstitutionCenterSubmission"],
                    "nihInstitutionalCertificationFileName": upload_dict[study_name]["nihInstitutionalCertificationFileName"],
                    "nihGenomicProgramAdministratorName": upload_dict[study_name]["nihGenomicProgramAdministratorName"],
                    "collaboratingSites": upload_dict[study_name]["collaboratingSites"],
                    "alternativeDataSharingPlan": upload_dict[study_name]["alternativeDataSharingPlan"],
                    "alternativeDataSharingPlanExplanation": upload_dict[study_name]["alternativeDataSharingPlanExplanation"],
                    "alternativeDataSharingPlanReasons": upload_dict[study_name]["alternativeDataSharingPlanReasons"],
                    "consentGroups": consent_group_list
                }
                upload_dict[study_name] = study_dict
        if consent_mismatch_list:
            consent_mismatch_str = ", ".join(consent_mismatch_list)
            results_log.append(["Input File Formatting", "Failed", f"Snapshots with mismatched consent with existing DUOS datasets: {consent_mismatch_str}"])
        else:
            msg = f"Input file formatting complete. Existing Datasets: {existing_dataset_cnt} New Datasets: {new_dataset_cnt}"
            print(msg)
            results_log.append(["Input File Formatting", "Succeeded", msg])
    except Exception as e:
        msg = f"Error parsing and formatting input file: {str(e)}"
        results_log.append(["Input File Formatting", "Failed", msg])
        print(msg)
        return results_log
    
    # Loop through studies and dataset to upload
    for study in upload_dict.keys():
        if study in study_upload_list or len(study_upload_list) == 0:
            print(f"Uploading data for study {study} into DUOS")
            # For studies that don't exist in DUOS, create a new study
            if not study_lookup.get(study):
                print("Study does NOT currently exist in DUOS. Creating new study and dataset records...")
                try:
                    new_study_response = requests.post(
                        url=f"{url}/api/dataset/v3",
                        headers={"Authorization": f"Bearer {token}"},
                        files = {
                            "dataset": json.dumps(upload_dict[study]),
                            "alternativeDataSharingPlan": "",
                            "consentGroups[0].nihInstitutionalCertificationFile": ""  
                        }
                    ).json()
                    if new_study_response.get("studyId"):
                        study_id = new_study_response["studyId"]
                        msg = f"Study registration succeeded! Study Id: {study_id}"
                        results_log.append([f"New Study Registration - {study}", "Succeeded", msg])
                        print(msg)
                    else:
                        err_msg = new_study_response["message"]
                        msg = f"Study registration failed: {err_msg}"
                        results_log.append([f"New Study Registration - {study}", "Failed", msg])
                        print(msg)
                except Exception as e:
                    msg = f"Study registration failed: {str(e)}"
                    results_log.append([f"New Study Registration - {study}", "Failed", msg])
                    print(msg)

            # For studies that already exist in DUOS, update the existing study
            else:
                print("Study currently exists in DUOS. Updating study and dataset records...")
                # Identify existing datasets for the study that are not present in the upload dict
                study_id = study_lookup.get(study)
                study_details = requests.get(
                        url=f"{url}/api/dataset/study/{study_id}",
                        headers={"Authorization": f"Bearer {token}"}
                    ).json()
                study_datasets_in_duos = set(study_details.get("datasetIds"))
                study_datasets_in_input = set()
                for datasets in upload_dict[study]["consentGroups"]:
                    if datasets.get("datasetId"):
                        study_datasets_in_input.add(datasets.get("datasetId"))
                study_datasets_diff = study_datasets_in_duos.difference(study_datasets_in_input)
                # Add missing datasets to the upload dict
                temp_cg = upload_dict[study]["consentGroups"].copy()
                for missing_dataset_id in study_datasets_diff:
                    dataset_details = requests.get(
                        url=f"{url}/api/dataset/v2/{missing_dataset_id}",
                        headers={"Authorization": f"Bearer {token}"}
                    ).json()
                    name = dataset_details["name"]
                    data_loc = ""
                    data_loc_url = ""
                    num_participants = 0
                    for prop_entry in dataset_details["properties"]:
                        if prop_entry["propertyName"] == "Data Location":
                            data_loc = prop_entry["propertyValue"]
    #                     elif prop_entry["propertyName"] == "URL":  # Nulling out URLs for snapshots not in the release set
    #                         data_loc_url = prop_entry["propertyValue"]
                        elif prop_entry["propertyName"] == "# of participants":
                            num_participants = prop_entry["propertyValue"]
                    consent_group_dict = {
                        "consentGroupName": dataset_details["name"],
                        "datasetId": missing_dataset_id,
                        "numberOfParticipants": num_participants,
                        "dataLocation": data_loc,
                        "url": data_loc_url,
                        "fileTypes": []
                    }
                    temp_cg.append(consent_group_dict)
                upload_dict[study]["consentGroups"] = temp_cg
                try:
                    # Update study in DUOS
                    update_study_response = requests.put(
                        url=f"{url}/api/dataset/study/{study_id}",
                        headers={"Authorization": f"Bearer {token}"},
                        files = {
                            "dataset": json.dumps(upload_dict[study]),
                            "alternativeDataSharingPlan": "",
                            "consentGroups[0].nihInstitutionalCertificationFile": ""  
                        }
                    ).json()   
                    if update_study_response.get("studyId"):
                        study_id = update_study_response["studyId"]
                        msg = f"Study registration succeeded! Study Id: {study_id}"
                        results_log.append([f"New Study Registration - {study}", "Succeeded", msg])
                        print(msg)
                    else:
                        err_msg = update_study_response["message"]
                        msg = f"Study registration failed: {err_msg}"
                        results_log.append([f"New Study Registration - {study}", "Failed", msg])
                        print(msg)
                except Exception as e:
                    msg = f"Study registration failed: {str(e)}"
                    results_log.append([f"Study Registration Update - {study}", "Failed", msg])
                    print(msg)
    
    # Return results
    return results_log


#############################################
## Input Parameters
#############################################

# Cloud path to file to process
input_file_gcs_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/anvil_dataset_metadata_20240724.tsv"

# User token (use gcloud auth print-access-token to get this)
# Sent as the Bearer token in the Authorization header of the DUOS requests.
token = ""

# Environment
# NOTE(review): presumably "prod" selects the production DUOS host and any other
# value selects dev, as in the step-through cells below -- confirm in upload_to_duos.
env = "dev"

# Target DAC identifier
dac_id = 3

# Study Upload List (to limit the studies upload, leave empty for all)
# Only studies named in this list are uploaded; an empty list uploads every study.
study_upload_list = []

#############################################
## Execution
#############################################

# upload_to_duos returns a log of [item, status, message] rows, one per step/study.
upload_results = upload_to_duos(input_file_gcs_path, token, env, dac_id, study_upload_list)
# Tabulate the log so the outcome of each step can be reviewed at a glance.
df_results = pd.DataFrame(upload_results, columns = ["Item", "Status", "Message"])
print("\nUpload Results:")
display(df_results)


## Step Through Code (Testing)

### Specify Inputs

In [None]:
# INPUTS
# Alternate test inputs are kept below, commented out; the last assignment is the active one.
#input_file_gcs_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/test/initial_metadata_ingest.txt"
#input_file_gcs_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/test/updated_metadata_ingest.txt"
#input_file_gcs_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/test/updated_metadata_ingest_2.txt"
#input_file_gcs_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/test/updated_metadata_ingest_3.txt"
input_file_gcs_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/anvil_dataset_metadata_20240724.tsv"
token = ""
env = "dev"
dac_id = 3

# Resolve the DUOS base URL: "prod" targets production, any other value targets dev
url = (
    "https://consent.dsde-prod.broadinstitute.org"
    if env == "prod"
    else "https://consent.dsde-dev.broadinstitute.org"
)

# Results log: each later step appends an [item, status, message] row
results_log = []

# FUNCTIONS
def format_list(input_list, min_items):
    """Coerce a value into a list, padding empty input with "Unknown" entries.

    Args:
        input_list: A list, a string holding a Python literal (e.g. "['a', 'b']"),
            or any other value.
        min_items: Number of "Unknown" placeholders to return when the input is
            empty/falsy; values of 0 or less yield an empty list.

    Returns:
        The list itself when given a non-empty list; the parsed value run back
        through this function when given a non-empty string; the placeholder
        padding when the input is falsy; otherwise an empty list.

    Raises:
        ValueError, SyntaxError: If a non-empty string is not a valid Python literal.
    """
    if not input_list:
        # Nothing supplied: pad up to the requested minimum
        return ["Unknown"] * min_items if min_items > 0 else []
    if isinstance(input_list, list):
        return input_list
    if isinstance(input_list, str):
        # Parse the literal, then apply the same rules to whatever it evaluates to
        return format_list(ast.literal_eval(input_list), min_items)
    # Non-empty value that is neither a list nor a string
    return []
    
def format_file_types(ft_list, fe):
    """Build the list of file-type dicts for a consent group.

    Args:
        ft_list: File types, as a list or a string literal of a list
            (normalised through format_list).
        fe: Functional equivalence value applied to every file type;
            "Unknown" is used when it is empty/falsy.

    Returns:
        A list of {"fileType": ..., "functionalEquivalence": ...} dicts,
        or an empty list when ft_list is empty/falsy.
    """
    if not ft_list:
        return []
    equivalence = fe if fe else "Unknown"
    return [
        {"fileType": file_type, "functionalEquivalence": equivalence}
        for file_type in format_list(ft_list, 0)
    ]

### Read in input file

In [None]:
# Pull down specified file from the cloud
# Reads the tab-delimited file at input_file_gcs_path and converts it into a list of
# per-row dicts (input_dict), recording the outcome in results_log.
print(f"Downloading input file {input_file_gcs_path}...")
try:
    input_df = pd.read_csv(input_file_gcs_path, delimiter = "\t", encoding='unicode_escape')
    # Cast to object and replace missing values with None...
    input_df = input_df.astype(object).where(pd.notnull(input_df),None)
    # ...then turn any remaining missing values into empty strings
    input_df.fillna("",inplace=True)
    # One dict per row, keyed by column name (a list of dicts, despite the variable name)
    input_dict = input_df.to_dict(orient="records")
    results_log.append(["Input File Download", "Succeeded", ""])
except Exception as e:
    msg = f"Error downloading input file ({input_file_gcs_path}): {str(e)}"
    results_log.append(["Input File Download", "Failed", msg])
    print(msg)
    # NOTE: input_dict is not assigned on this path, so later cells will either fail
    # or reuse a value left over from an earlier run.
    #return results_log

In [None]:
# Distinct study names present in the input records
study_name_set = {record["studyName"] for record in input_dict}

In [None]:
# Display the distinct study names collected from the input file
study_name_set

### Read in existing DUOS data

In [None]:
# Pull a list of existing datasets and studies from DUOS and build lookup dicts:
#   study_lookup        - study name -> DUOS study id (the first occurrence of a name wins)
#   full_dataset_lookup - dataset name (trailing snapshot suffix stripped) -> dataset details
#   dataset_lookup      - the entries of full_dataset_lookup flagged as AnVIL datasets
print("Building study and dataset lookup dicts from DUOS...")
try:
    datasets = requests.get(
        url=f"{url}/api/dataset/v2?asCustodian=false",
        headers={"Authorization": f"Bearer {token}"}
    ).json()
    study_lookup = {}
    full_dataset_lookup = {}
    dataset_lookup = {}
    for dataset_entry in datasets:
        base_name = ""
        dataset_name = ""
        dataset_id = dataset_entry.get("dataSetId")
        study_name = ""
        study_id = None
        if dataset_entry.get("name"):
            base_name = dataset_entry.get("name")
            try:
                # Strip a trailing "_<8 digits>_ANV<n>_<12 digits>" suffix when present
                dataset_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', base_name).group(1)
            except (AttributeError, TypeError):
                # No suffix match (search returned None) or a non-string name: use the name as-is.
                # Deliberately narrow so KeyboardInterrupt/SystemExit are not swallowed.
                dataset_name = base_name
        anvil_dataset = False
        study = dataset_entry.get("study")
        if study:
            if study.get("name"):
                study_name = study.get("name")
                study_id = study.get("studyId")
            # A dataset counts as AnVIL when its study description carries the platform marker
            if study.get("description") and "Platform: AnVIL" in study.get("description"):
                anvil_dataset = True
        if study_name and study_name not in study_lookup:
            study_lookup[study_name] = study_id
        if dataset_name:
            full_dataset_lookup[dataset_name] = {
                "id": dataset_id,
                "duos_identifier": dataset_entry.get("datasetIdentifier"),
                "name": base_name,
                "create_date": dataset_entry.get("createDate"),
                "study_name": study_name,
                "study_id": study_id,
                "anvil_dataset": anvil_dataset,
                "data_use": dataset_entry.get("dataUse")
            }
    # Keep only the AnVIL datasets for matching against the input file
    for key, val in full_dataset_lookup.items():
        if val["anvil_dataset"]:
            dataset_lookup[key] = val
    results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Succeeded", ""])
except Exception as e:
    msg = f"Error building study and dataset lookups: {str(e)}"
    results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Failed", msg])
    print(msg)
    #return results_log

In [None]:
# Display the study name -> DUOS study id lookup built above
study_lookup

In [None]:
# Spot-check a single dataset entry (raises KeyError if this name is not in the lookup)
dataset_lookup["ANVIL_GTEx_V8_hg38"]

### Build input data for DUOS from input file

In [None]:
# Parse and build DUOS schema for inputted file.
# Produces upload_dict (study name -> study payload with its list of consent groups).
# Study-level fields are taken from the first input row seen for a study; later rows for
# the same study only add or replace consent groups.
print("Parsing input file and formatting into DUOS schema...")
try:
    # Determine data submitter id (the user DUOS returns for the supplied token)
    response = requests.get(
        url=f"{url}/api/user/me",
        headers={"Authorization": f"Bearer {token}"}
    ).json()
    data_submitter_id = response["userId"]
    # Build dictionary for upload
    upload_dict = {}
    consent_mismatch_list = []
    existing_dataset_cnt = 0
    new_dataset_cnt = 0
    for input_entry in input_dict:
        snapshot_id = input_entry["snapshot_id"]
        study_name = input_entry["studyName"]
        base_name = input_entry["consentGroups.consentGroupName"]
        try:
            # Strip a trailing "_<8 digits>_ANV<n>_<12 digits>" suffix when present
            consent_group_name = re.search(r'(.*)_[0-9]{8}_ANV[0-9]+_[0-9]{12}$', base_name).group(1)
        except (AttributeError, TypeError):
            # No suffix match (search returned None) or a non-string name: use the name as-is.
            # Deliberately narrow so KeyboardInterrupt/SystemExit are not swallowed.
            consent_group_name = base_name
        access_type = input_entry["consentGroups.accessManagement"]
        study_id = study_lookup.get(study_name)
        dataset = dataset_lookup.get(consent_group_name)
        dataset_id = ""
        dataset_id_str = ""
        if dataset:
            dataset_id_from_dataset = dataset["id"]
            study_id_from_dataset = dataset["study_id"]
            # NOTE(review): values missing in DUOS come back as None here, while blank input
            # cells are "" / [] -- confirm these compare the way the mismatch check intends.
            gru_from_dataset = dataset["data_use"].get("generalUse")
            gru_from_input = input_entry["consentGroups.generalResearchUse"]
            hmb_from_dataset = dataset["data_use"].get("hmbResearch")
            hmb_from_input = input_entry["consentGroups.hmb"]
            ds_from_dataset = dataset["data_use"].get("diseaseRestrictions")
            ds_from_input = format_list(input_entry["consentGroups.diseaseSpecificUse"], 0)
            # Only reuse the DUOS dataset id when the dataset belongs to the target study
            if study_id_from_dataset == study_id:
                dataset_id = dataset_id_from_dataset
                dataset_id_str = f" ({dataset_id})"
            else:
                dataset_id = None
            print(f"Parsing and formatting metadata for snapshot {snapshot_id} from the input file. Target study is: {study_name}. Target consent group is: {consent_group_name}{dataset_id_str}")
            if input_entry["consentGroups.accessManagement"] == "controlled" and (gru_from_dataset != gru_from_input or hmb_from_dataset != hmb_from_input or ds_from_dataset != ds_from_input):
                print("\tERROR: Mismatching consent information in existing DUOS record vs. proposed new record. Update the consent in the new record to match OR update the consent group name in the new record to create a new DUOS record.")
                consent_mismatch_list.append(snapshot_id)
        else:
            print(f"Parsing and formatting metadata for snapshot {snapshot_id} from the input file. Target study is: {study_name}. Target consent group is: {consent_group_name}")
        # If this is an existing dataset in the specified existing study, provide limited consent group information (for updates only)
        if study_id and dataset_id:
            existing_dataset_cnt += 1
            consent_group_dict = {
                "consentGroupName": consent_group_name,
                "datasetId": dataset_id,
                "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                "dataLocation": input_entry["consentGroups.dataLocation"],
                "url": input_entry["consentGroups.url"],
                "fileTypes": []
                #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
            }
        # If this is a new dataset that is open access, provide limited consent group information
        elif access_type == "open":
            new_dataset_cnt += 1
            consent_group_dict = {
                "consentGroupName": consent_group_name,
                "accessManagement": access_type,
                "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                "dataLocation": input_entry["consentGroups.dataLocation"],
                "url": input_entry["consentGroups.url"],
                "fileTypes": []
                #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
            }
        # If this is a new dataset that is NOT open access, provide the full consent group information
        else:
            new_dataset_cnt += 1
            consent_group_dict = {
                "consentGroupName": consent_group_name,
                "dataAccessCommitteeId": dac_id,
                "accessManagement": access_type,
                "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                "generalResearchUse": input_entry["consentGroups.generalResearchUse"],
                "hmb": input_entry["consentGroups.hmb"],
                "diseaseSpecificUse": format_list(input_entry["consentGroups.diseaseSpecificUse"], 0),
                "gs": input_entry["consentGroups.gs"],
                "poa": input_entry["consentGroups.poa"],
                "nmds": input_entry["consentGroups.nmds"],
                "gso": input_entry["consentGroups.gso"],
                "pub": input_entry["consentGroups.pub"],
                "col": input_entry["consentGroups.col"],
                "irb": input_entry["consentGroups.irb"],
                "npu": input_entry["consentGroups.npu"],
                "otherPrimary": input_entry["consentGroups.otherPrimary"],
                #"otherSecondary": input_entry["consentGroups.otherSecondary"], --> Excluding for now, per JL's request
                #"mor": input_entry["consentGroups.mor"], --> Date formatting validation for morDate, exclude for now
                #"morDate": input_entry["consentGroups.morDate"], --> Date formatting validation, exclude for now
                "dataLocation": input_entry["consentGroups.dataLocation"],
                "url": input_entry["consentGroups.url"],
                "fileTypes": []
                #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
            }
        study_dict = {}
        consent_group_list = []
        # If the study associated with the record is not already in the upload dictionary, create a new study dict and append the consent group dict
        if study_name not in upload_dict:
            consent_group_list.append(consent_group_dict)
            study_dict = {
                "studyName": study_name,
                #"studyType": input_entry["studyType"], --> Enumeration, exclude for now
                "studyDescription": input_entry["studyDescription"],
                "dataTypes": format_list(input_entry["dataTypes"], 1),
                "phenotypeIndication": input_entry["phenotypeIndication"],
                "species": input_entry["species"],
                "piName": input_entry["piName"] if input_entry["piName"] else "NA",
                "dataSubmitterUserId": data_submitter_id,
                "dataCustodianEmail": format_list(input_entry["dataCustodianEmail"], 0),
                "publicVisibility": input_entry["publicVisibility"],
                "nihAnvilUse": input_entry["nihAnvilUse"],
                "submittingToAnvil": input_entry["submittingToAnvil"],
                "dbGaPPhsID": input_entry["dbGaPPhsID"],
                "dbGaPStudyRegistrationName": input_entry["studyName"],
                #"embargoReleaseDate": input_entry["embargoReleaseDate"], --> Date formatting validation, exclude for now
                "sequencingCenter": input_entry["sequencingCenter"],
                "piEmail": input_entry["piEmail"],
                #"piInstitution": input_entry["piInstitution"], --> Integer ID for registered institutions, exclude for now
                "piInstitution": 0,
                "nihGrantContractNumber": "Unknown", # Required currently
                "nihICsSupportingStudy": format_list(input_entry["nihICsSupportingStudy"], 0),
                "nihProgramOfficerName": input_entry["nihProgramOfficerName"],
                "nihInstitutionCenterSubmission": input_entry["nihInstitutionCenterSubmission"],
                "nihInstitutionalCertificationFileName": input_entry["nihInstitutionalCertificationFileName"],
                "nihGenomicProgramAdministratorName": input_entry["nihGenomicProgramAdministratorName"],
                "collaboratingSites": format_list(input_entry["collaboratingSites"], 0),
                "alternativeDataSharingPlan": input_entry["alternativeDataSharingPlan"],
                "alternativeDataSharingPlanExplanation": input_entry["alternativeDataSharingPlanExplanation"],
                # Default the reasons to ["Other"] when a plan is flagged but no reasons were given.
                # The explicit == True is kept on purpose: a truthy string must not match.
                "alternativeDataSharingPlanReasons": ["Other"] if input_entry["alternativeDataSharingPlan"] == True and input_entry["alternativeDataSharingPlanReasons"] == "[]" else format_list(input_entry["alternativeDataSharingPlanReasons"], 0),
                "consentGroups": consent_group_list
            }
            upload_dict[study_name] = study_dict
        # If the study is already in the upload dictionary, keep its study-level fields and extend its list of consent groups
        else:
            # Drop any earlier consent group with the same name so the latest row wins
            consent_group_list = [
                consent_group
                for consent_group in upload_dict[study_name]["consentGroups"]
                if consent_group["consentGroupName"] != consent_group_dict["consentGroupName"]
            ]
            consent_group_list.append(consent_group_dict)
            # Same keys, values and key order as the existing study dict; only consentGroups changes
            study_dict = {**upload_dict[study_name], "consentGroups": consent_group_list}
            upload_dict[study_name] = study_dict
    if consent_mismatch_list:
        # str() guards against snapshot ids that were not parsed as strings
        consent_mismatch_str = ", ".join(str(mismatch_id) for mismatch_id in consent_mismatch_list)
        results_log.append(["Input File Formatting", "Failed", f"Snapshots with mismatched consent with existing DUOS datasets: {consent_mismatch_str}"])
    else:
        msg = f"Input file formatting complete. Existing Datasets: {existing_dataset_cnt} New Datasets: {new_dataset_cnt}"
        print(msg)
        results_log.append(["Input File Formatting", "Succeeded", msg])
except Exception as e:
    msg = f"Error parsing and formatting input file: {str(e)}"
    results_log.append(["Input File Formatting", "Failed", msg])
    print(msg)
    #return results_log

In [None]:
# Spot-check the payload built for a single study (raises KeyError if the study is not in upload_dict)
upload_dict["Genotype-Tissue Expression (GTEx) (phs000424)"]

### Upload to DUOS

In [None]:
# Studies to upload; leave the list empty to upload every study in upload_dict
study_upload_list = [
]

# Loop through studies and dataset to upload
for study in upload_dict:
    # Skip studies that are outside a non-empty upload list
    if study_upload_list and study not in study_upload_list:
        continue
    print(f"Uploading data for study {study} into DUOS")
    # For studies that don't exist in DUOS, create a new study
    if not study_lookup.get(study):
        print("Study does NOT currently exist in DUOS. Creating new study and dataset records...")
        try:
            new_study_response = requests.post(
                url=f"{url}/api/dataset/v3",
                headers={"Authorization": f"Bearer {token}"},
                files = {
                    "dataset": json.dumps(upload_dict[study]),
                    "alternativeDataSharingPlan": "",
                    "consentGroups[0].nihInstitutionalCertificationFile": ""
                }
            ).json()
            if new_study_response.get("studyId"):
                study_id = new_study_response["studyId"]
                msg = f"Study registration succeeded! Study Id: {study_id}"
                results_log.append([f"New Study Registration - {study}", "Succeeded", msg])
                print(msg)
            else:
                err_msg = new_study_response["message"]
                msg = f"Study registration failed: {err_msg}"
                results_log.append([f"New Study Registration - {study}", "Failed", msg])
                print(msg)
        except Exception as e:
            msg = f"Study registration failed: {str(e)}"
            results_log.append([f"New Study Registration - {study}", "Failed", msg])
            print(msg)

    # For studies that already exist in DUOS, update the existing study
    else:
        print("Study currently exists in DUOS. Updating study and dataset records...")
        study_id = study_lookup.get(study)
        # Every request for this study sits inside the try so that a failure is logged,
        # the PUT is skipped, and the loop carries on with the next study.
        try:
            # Identify existing DUOS datasets for the study that are not present in the upload dict
            study_details = requests.get(
                url=f"{url}/api/dataset/study/{study_id}",
                headers={"Authorization": f"Bearer {token}"}
            ).json()
            study_datasets_in_duos = set(study_details.get("datasetIds"))
            study_datasets_in_input = {
                consent_group.get("datasetId")
                for consent_group in upload_dict[study]["consentGroups"]
                if consent_group.get("datasetId")
            }
            study_datasets_diff = study_datasets_in_duos.difference(study_datasets_in_input)
            # Add missing datasets to the upload dict so the update payload still lists them
            temp_cg = upload_dict[study]["consentGroups"].copy()
            for missing_dataset_id in study_datasets_diff:
                dataset_details = requests.get(
                    url=f"{url}/api/dataset/v2/{missing_dataset_id}",
                    headers={"Authorization": f"Bearer {token}"}
                ).json()
                data_loc = ""
                # The DUOS "URL" property is deliberately not carried over: URLs are
                # nulled out for snapshots not in the release set.
                data_loc_url = ""
                num_participants = 0
                for prop_entry in dataset_details["properties"]:
                    if prop_entry["propertyName"] == "Data Location":
                        data_loc = prop_entry["propertyValue"]
                    elif prop_entry["propertyName"] == "# of participants":
                        num_participants = prop_entry["propertyValue"]
                consent_group_dict = {
                    "consentGroupName": dataset_details["name"],
                    "datasetId": missing_dataset_id,
                    "numberOfParticipants": num_participants,
                    "dataLocation": data_loc,
                    "url": data_loc_url,
                    "fileTypes": []
                }
                temp_cg.append(consent_group_dict)
            upload_dict[study]["consentGroups"] = temp_cg
            # Update study in DUOS
            update_study_response = requests.put(
                url=f"{url}/api/dataset/study/{study_id}",
                headers={"Authorization": f"Bearer {token}"},
                files = {
                    "dataset": json.dumps(upload_dict[study]),
                    "alternativeDataSharingPlan": "",
                    "consentGroups[0].nihInstitutionalCertificationFile": ""
                }
            ).json()
            # Updates are logged under "Study Registration Update" so they can be told
            # apart from new registrations in the results table
            if update_study_response.get("studyId"):
                study_id = update_study_response["studyId"]
                msg = f"Study registration succeeded! Study Id: {study_id}"
                results_log.append([f"Study Registration Update - {study}", "Succeeded", msg])
                print(msg)
            else:
                err_msg = update_study_response["message"]
                msg = f"Study registration failed: {err_msg}"
                results_log.append([f"Study Registration Update - {study}", "Failed", msg])
                print(msg)
        except Exception as e:
            msg = f"Study registration failed: {str(e)}"
            results_log.append([f"Study Registration Update - {study}", "Failed", msg])
            print(msg)

In [None]:
# Display the response from the most recent study update
# (raises NameError if no update request has been made in this session)
update_study_response

### Verify the output in DUOS

In [None]:
# Pull the existing datasets from DUOS and print the study name, consent group name and
# TDR URL of every dataset whose name contains both "ANVIL_DSU" and "V2"
datasets = requests.get(
    url=f"{url}/api/dataset/v2?asCustodian=false",
    headers={"Authorization": f"Bearer {token}"}
).json()
for dataset_entry in datasets:
    dataset_name = dataset_entry.get("datasetName")
    study_name = ""
    tdr_url = ""
    # Only report the datasets of interest
    if not (dataset_name and "ANVIL_DSU" in dataset_name and "V2" in dataset_name):
        continue
    if dataset_entry.get("study") and dataset_entry["study"].get("name"):
        study_name = dataset_entry["study"].get("name")
    properties = dataset_entry.get("properties")
    # The value of the "URL" property is reported as the TDR URL (last one wins)
    for prop_entry in properties or []:
        if prop_entry["propertyName"] == "URL":
            tdr_url = prop_entry["propertyValue"]
    print(f"Study Name: {study_name}")
    print(f"Consent Group Name: {dataset_name}")
    print(f"TDR URL: {tdr_url}")
    print("----------------------------------------------------------------------")

# Step 3: Attach DUOS IDs to Snapshots

## Add DUOS IDs Based on Snapshot Listed in DUOS

In [None]:
#############################################
## Functions
#############################################

def link_duos_ids_to_snapshots(snapshot_id_list, env, token):
    """Link TDR snapshots to the DUOS datasets that point at them.

    A snapshot ID -> DUOS ID lookup is built from the DUOS dataset listing
    (datasets whose "Data Location" property is "TDR Location" and whose "URL"
    property contains a snapshot UUID). For every requested snapshot the
    function then:
      1. links the DUOS ID to the snapshot in TDR,
      2. fetches the Firecloud user group associated with the DUOS ID, and
      3. adds that group to each group listed in the snapshot's auth domain.

    Args:
        snapshot_id_list: Snapshot IDs (UUID strings) to process.
        env: "prod" targets the production DUOS instance; any other value
            targets the dev instance.
        token: Bearer token used for the DUOS dataset listing request. The TDR
            and Firecloud calls use Google application default credentials.

    Returns:
        A list of [item, status, message] rows, one per step outcome. If the
        lookup cannot be built, the list contains only that failure.
    """
    results_log = []
    max_attempts = 3        # Each TDR call is tried at most this many times
    retry_wait_seconds = 5  # Pause between two attempts

    # Determine the target URL from the env variable
    if env == "prod":
        duos_url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        duos_url = "https://consent.dsde-dev.broadinstitute.org"

    # Pull a list of existing datasets from DUOS and build the snapshot lookup
    print("Building lookup between Snapshot and DUOS ID...")
    uuid_pattern = re.compile(
        r"([a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})", re.IGNORECASE
    )
    try:
        datasets_response = requests.get(
            url=f"{duos_url}/api/dataset/v2?asCustodian=false",
            headers={"Authorization": f"Bearer {token}"}
        )
        # Surface HTTP errors (e.g. an expired token) instead of building an empty lookup
        datasets_response.raise_for_status()
        datasets = datasets_response.json()
        if not isinstance(datasets, list):
            raise ValueError("Unexpected response format from the DUOS dataset listing.")
        snapshot_lookup = {}
        for dataset_entry in datasets:
            try:
                tdr_url = ""
                is_snapshot = False
                for prop_entry in dataset_entry["properties"]:
                    if prop_entry["propertyName"] == "URL":
                        tdr_url = prop_entry["propertyValue"]
                    elif prop_entry["propertyName"] == "Data Location" and prop_entry["propertyValue"] == "TDR Location":
                        is_snapshot = True
                if is_snapshot:
                    lookup_snapshot_id = uuid_pattern.search(tdr_url).group(1)
                    snapshot_lookup[lookup_snapshot_id] = dataset_entry["datasetIdentifier"]
            except Exception:
                # Best effort: entries with missing properties or no snapshot UUID
                # in their URL simply do not contribute to the lookup.
                continue
        results_log.append(["Snapshot Lookup Creation", "Success", ""])
    except Exception as e:
        msg = f"Error building lookup between Snapshot and DUOS ID: {str(e)}"
        results_log.append(["Snapshot Lookup Creation", "Failed", msg])
        print(msg)
        return results_log

    # Loop through input snapshots and link DUOS IDs to them
    print("Linking DUOS IDs to Snapshots...")
    creds, _ = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    api_client = refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    duos_api = data_repo_client.DuosApi(api_client=api_client)
    for snapshot_id in snapshot_id_list:
        print(f"\tProcessing snapshot ID = {snapshot_id}")
        duos_id = snapshot_lookup.get(snapshot_id)
        if not duos_id:
            results_log.append([f"DUOS ID to Snapshot Linkage ({snapshot_id})", "Failed", "No DUOS ID found for the snapshot."])
            continue

        # Link the DUOS ID to the snapshot
        print(f"\t\t- Linking DUOS ID {duos_id} to snapshot.")
        link_item = f"DUOS ID to Snapshot Linkage ({snapshot_id} - {duos_id})"
        for attempt in range(1, max_attempts + 1):
            try:
                response = snapshots_api.link_duos_dataset_to_snapshot(id=snapshot_id, duos_id=duos_id).to_dict()
                if response.get("linked"):
                    results_log.append([link_item, "Success", ""])
                    break
                # A response without "linked" counts as a failed attempt, so the
                # loop always terminates even when no exception is raised.
                response_message = response.get("message") or "Response did not confirm the link."
                msg = f"Error linking DUOS ID to Snapshot: {response_message}"
            except Exception as e:
                msg = f"Error linking DUOS ID to Snapshot: {str(e)}"
            if attempt == max_attempts:
                results_log.append([link_item, "Failed", msg])
            else:
                sleep(retry_wait_seconds)

        # Fetch the DUOS user group associated with the DUOS ID
        print(f"\t\t- Fetching DUOS user group from DUOS ID {duos_id}.")
        duos_group = ""
        for attempt in range(1, max_attempts + 1):
            try:
                response = duos_api.retrieve_duos_firecloud_group(duos_id=duos_id).to_dict()
                duos_group = response["firecloud_group_email"]
                results_log.append([f"DUOS User Group Fetching ({duos_id})", "Success", ""])
                break
            except Exception as e:
                msg = f"Error fetching DUOS user group for DUOS ID {duos_id}: {str(e)}"
                if attempt == max_attempts:
                    results_log.append([f"DUOS User Group Fetching ({duos_id})", "Failed", msg])
                else:
                    sleep(retry_wait_seconds)

        # Without a group there is nothing to add; avoid calling Firecloud with an empty member
        if not duos_group:
            msg = "No DUOS user group available to add to the snapshot DAC user group(s)."
            print(f"\t\t- {msg}")
            results_log.append([f"DUOS Group to DAC Group Addition ({snapshot_id} - {duos_id})", "Failed", msg])
            continue

        # Add the DUOS user group to any DAC groups on the snapshot
        print(f"\t\t- Adding DUOS user group {duos_group} to snapshot DAC user group(s).")
        dac_groups = []
        try:
            response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id).to_dict()
            if response.get("auth_domain"):
                dac_groups = response["auth_domain"]
            if dac_groups:
                print(f"\t\t\t- DAC user group(s) found on snapshot: {dac_groups}.")
                for dac_group in dac_groups:
                    response = requests.put(
                        url=f"https://api.firecloud.org/api/groups/{dac_group}/member/{duos_group}",
                        headers={"Authorization": f"Bearer {creds.token}"}
                    )
                    # The original check treats 204 as the only success status
                    if response.status_code != 204:
                        results_log.append([f"DUOS Group to DAC Group Addition ({duos_group} - {dac_group})", "Failed", "Error adding DUOS group to DAC group."])
                    else:
                        results_log.append([f"DUOS Group to DAC Group Addition ({duos_group} - {dac_group})", "Success", ""])
            else:
                msg = "No DAC user group(s) found on snapshot."
                print(f"\t\t\t- {msg}")
                results_log.append([f"DUOS Group to DAC Group Addition ({snapshot_id} - {duos_id})", "Warning", msg])
        except Exception as e:
            msg = f"Error adding DUOS Group to DAC Group: {str(e)}"
            results_log.append([f"DUOS Group to DAC Group Addition ({snapshot_id} - {duos_id})", "Failed", msg])
    return results_log

#############################################
## Input Parameters
#############################################

# User token sent as the Bearer token on the DUOS dataset listing request
# (use gcloud auth print-access-token to get this)
token = ""

# Environment: "prod" selects the production DUOS URL; any other value selects dev
env = "prod"

# Snapshot list
snapshot_id_list = [
    '737d454c-88be-477f-ae2c-ef473e2106ce',
    '253e2b36-1674-482b-bfbd-4e0b05cdfe63',
    '3f53e841-ca9d-4b55-b390-590718533561',
    '01cf2450-604b-43e5-9f4e-9ec4e0bf0a61',
    '85b0b351-cd0a-4efe-95a4-e39273c42831',
    'c9037419-367e-439c-a247-b0dae7c24146',
    'd7b2b2c6-72fd-4084-af34-a86edfe3ac47',
    'd63a63ce-24c8-413a-89c0-4bd4c82370c0',
    '1bb208f2-ecf3-4589-a9bd-b6e94178584d',
    '5773565d-ad7c-4f51-8b4f-f1ee5dffc08a',
    '2e5c5fe3-3af4-4c34-a85e-af6b4135f089',
    '27068295-b3c0-4260-9447-9ca96814d46f',
    '060c707a-2f0d-4730-bbd6-d25489abfcf6',
    '7e59197f-b859-4279-add3-de24bbc7e52b',
    '624fef99-e4ce-4c12-a3d9-90995b5da970',
    'a68d3145-81c2-41f8-9944-5e4a5058934a',
    'a3b18d45-96c2-4526-8fde-65ab3265868f',
    '3ec72891-87d2-431f-850c-e52013330ea8',
    '87d02347-d169-4ce0-9027-3c8e11e48c40',
    '61b6ae23-ca19-4d31-bad3-2281a8528886',
    '7c4edc65-bfe6-4ede-a68a-c0b9d2564f29',
    'f330517e-46fd-4de3-8063-015b524a7324',
    'f0d8bb27-1695-4faf-8b27-4b95260b8f17',
    '17d14df1-cb64-4aae-8049-c1728a3c0c81',
    '434f85e2-4435-483c-8099-b03c8ba794ed',
    '5bba97dc-d6ab-4329-912f-148c8b807056',
    '4c722626-c559-4f5a-84bd-8d7d46983e1e',
    '6df525e1-b143-4e6f-b667-80c783ae1b66',
    '079eb53c-e2b6-4da6-ab5f-fc2136a3ecc1',
    '1a26532c-16e6-4f1c-81f9-8f07a8181421',
    '3ac713b5-3645-4381-ac66-ecbc281a2ab8',
    '4911bd18-5db9-418a-9dc0-0ea28ae937d6',
    'bbd04481-0b9d-4c21-ba65-a43638116e0f',
    '2b78a3ac-8bca-4938-bc7c-26a60f9c04ac',
    '4bb891fc-fcae-40cc-bf59-73716de7e04e',
    '574e0d42-e712-4a86-be7a-4b3a95187bcd',
    '56078c29-a393-4c60-9e04-3674e02fe729',
    '099d2585-1379-4333-b3b1-ffc0d26d95c5',
    'ab71d294-4ba9-44d4-8051-913b3d5ccff3',
    '90fe2016-e79c-456c-a5f9-3a31149fcd65',
    'e43974fd-cee1-4d8c-a436-6846d7d24129',
    '0d607d21-c9c7-4852-83e3-76825176ee0a',
    '0a356156-961d-4829-b9b5-c07fbc73dacc',
    '18a28450-31ec-4e4a-a305-dbbdd226ae3c',
    'f7d225d9-1675-483d-a1eb-9ef750301cd4',
    'd4b02f5f-7a62-4cad-8ffc-d3deb0fab445',
    '4c8ce027-8094-4f5d-bf62-22b1d51b3c1e',
    'c753046a-cf9b-4813-be68-cb3b9dd9866e',
    'eb7948be-1007-4b0e-b9b6-a5c40bbb9596',
    '7639a9e0-275c-49a8-80c1-cdb01ce23e1c',
    'aa2bfacc-c28c-4192-960c-b1389cf68516',
    'd7349942-f8ff-4ad6-b075-8f39652a7789',
    'b9e0de2a-4085-4226-a073-1744914cbbd4',
    '44b1f60b-e74c-4430-9378-d4a75e2de72f',
    'a4c62d7f-34f0-4e2e-9e46-c762d3ab0ff2',
    '6a5b3be6-d1de-4f23-a431-b08e7ab231b8',
    '5208772d-21f9-46b0-8167-0b05b57296b8',
    '36690013-e8bc-43a5-9ba9-83317537557c',
    '172bada7-f1c5-41c4-836d-05381beaed9a',
    '9a1e873b-b1db-4d3e-a83b-ed6c5b3f3ecc',
    '2c6de04e-104d-42c8-8448-97d74985dacb',
    '452bcafd-ab45-4e24-b5e0-13fcf22b0755',
    'fbafdd31-21a0-44c5-ae4d-724839beff61',
    '2a1882d9-88ca-4849-bcc1-f6914f593407',
    '3838993f-59ba-4dec-8110-ac3ea387ab91',
    'bf2f4106-cee9-419c-b4d1-d7b03a6293d5',
    'a6c36f5e-b86c-4164-85ae-8bf0df2e4a90',
    '11a7572f-02b9-4f88-8c2c-802dfb1f94b7',
    '5e547934-c339-410e-a013-dfefed50f4b8',
    'ffa84feb-ca0e-43d3-a04d-a402a8e24a3b',
    '2be072bd-2153-4050-9358-e4b95297a9bf',
    '7c19d852-e36a-4353-afea-10e501601d9a',
    'fd3843fe-ee5d-4784-b0d2-6673f9886d30',
    '84703c54-a9dd-400c-9701-2fc40922e3e3',
    '00297802-e20a-413f-b389-a6f764b6600e',
    'c853d4c0-d4be-433d-964e-e30bdc35480e',
    '3e85b06a-a6ea-4ce8-a655-44b1fce12138',
    '6e674477-522f-4adc-8c50-76910a6a282b',
    '504089f1-c59d-48fe-84ef-858bd3eb3043',
    '0565b2e4-ade1-46e7-80bf-ca647a89a8b8',
    '1cf943bc-9ffe-4fd0-a92d-6fdcf68da743',
    'bb11d621-e471-4ca9-b9ae-cf06c99db297',
    '7b875b4b-a6c5-4c92-a252-cd5ff203089e',
    '97b3d565-3c32-4fd5-be49-c16f0bae84e7',
    'ea08adf0-2383-41ae-a91a-88c7b8f6f42b',
    '5b8c745a-972b-455c-8021-ee24fdbce9a5',
    'bebf0200-8458-4467-b001-ff436564e942',
    '1c16f983-c090-457a-aca7-4181d16e225b',
    'b259ac6c-3358-4faa-abfe-c9d614b76915',
    '1a119cfe-3178-4f06-800b-b2aec50218b8',
    '33c73ae8-f829-438d-bdb1-da0be8f3773f',
    '3d6afb8e-dbcd-4972-8281-ae546b23356c',
    '42fd7b4a-461d-4a4f-bb02-856e7124dce1',
    '08f28ada-3fa1-41f3-a7eb-5b4ff8325145',
    '189a0802-8538-41f8-ad51-8bb2a736783b',
    'e0dc36c3-ff48-4ab5-881f-899578e08dd4',
    '9052b5fc-8ac8-41ea-8a82-6860b8d2c33d',
    'b8bc131f-68d6-4c56-bd37-55c1b0e27d2e',
]
# Override: restrict this run to a single snapshot (replaces the full list above)
snapshot_id_list = ['737d454c-88be-477f-ae2c-ef473e2106ce']

#############################################
## Execution
#############################################

# Run the linkage and show one row per logged step
results = link_duos_ids_to_snapshots(snapshot_id_list, env, token)
df_results = pd.DataFrame(data=results, columns=["Item", "Status", "Message"])
print("\nLinking Results:")
display(df_results)


## Add DUOS IDs Directly

In [None]:
#############################################
## Functions
#############################################

def direct_link_duos_ids_to_snapshots(snapshot_duos_list):
    """Link explicitly paired DUOS IDs to TDR snapshots.

    For every [snapshot_id, duos_id] pair the function:
      1. links the DUOS ID to the snapshot in TDR,
      2. fetches the Firecloud user group associated with the DUOS ID, and
      3. adds that group to each group listed in the snapshot's auth domain.

    Args:
        snapshot_duos_list: Sequence of [snapshot_id, duos_id] pairs. A pair
            with an empty duos_id is logged as failed and skipped.

    Returns:
        A list of [item, status, message] rows, one per step outcome.
    """
    results_log = []
    max_attempts = 3        # Each TDR call is tried at most this many times
    retry_wait_seconds = 5  # Pause between two attempts

    # Loop through input snapshots and link DUOS IDs to them
    print("Linking DUOS IDs to Snapshots...")
    creds, _ = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    api_client = refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    duos_api = data_repo_client.DuosApi(api_client=api_client)
    for ss_duos_entry in snapshot_duos_list:
        snapshot_id = ss_duos_entry[0]
        duos_id = ss_duos_entry[1]
        print(f"\tProcessing snapshot ID = {snapshot_id}")
        if not duos_id:
            results_log.append([f"DUOS ID to Snapshot Linkage ({snapshot_id})", "Failed", "No DUOS ID found for the snapshot."])
            continue

        # Link the DUOS ID to the snapshot
        print(f"\t\t- Linking DUOS ID {duos_id} to snapshot.")
        link_item = f"DUOS ID to Snapshot Linkage ({snapshot_id} - {duos_id})"
        for attempt in range(1, max_attempts + 1):
            try:
                response = snapshots_api.link_duos_dataset_to_snapshot(id=snapshot_id, duos_id=duos_id).to_dict()
                if response.get("linked"):
                    results_log.append([link_item, "Success", ""])
                    break
                # A response without "linked" counts as a failed attempt, so the
                # loop always terminates even when no exception is raised.
                response_message = response.get("message") or "Response did not confirm the link."
                msg = f"Error linking DUOS ID to Snapshot: {response_message}"
            except Exception as e:
                msg = f"Error linking DUOS ID to Snapshot: {str(e)}"
            if attempt == max_attempts:
                results_log.append([link_item, "Failed", msg])
            else:
                sleep(retry_wait_seconds)

        # Fetch the DUOS user group associated with the DUOS ID
        print(f"\t\t- Fetching DUOS user group from DUOS ID {duos_id}.")
        duos_group = ""
        for attempt in range(1, max_attempts + 1):
            try:
                response = duos_api.retrieve_duos_firecloud_group(duos_id=duos_id).to_dict()
                duos_group = response["firecloud_group_email"]
                results_log.append([f"DUOS User Group Fetching ({duos_id})", "Success", ""])
                break
            except Exception as e:
                msg = f"Error fetching DUOS user group for DUOS ID {duos_id}: {str(e)}"
                if attempt == max_attempts:
                    results_log.append([f"DUOS User Group Fetching ({duos_id})", "Failed", msg])
                else:
                    sleep(retry_wait_seconds)

        # Without a group there is nothing to add; avoid calling Firecloud with an empty member
        if not duos_group:
            msg = "No DUOS user group available to add to the snapshot DAC user group(s)."
            print(f"\t\t- {msg}")
            results_log.append([f"DUOS Group to DAC Group Addition ({snapshot_id} - {duos_id})", "Failed", msg])
            continue

        # Add the DUOS user group to any DAC groups on the snapshot
        print(f"\t\t- Adding DUOS user group {duos_group} to snapshot DAC user group(s).")
        dac_groups = []
        try:
            response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id).to_dict()
            if response.get("auth_domain"):
                dac_groups = response["auth_domain"]
            if dac_groups:
                print(f"\t\t\t- DAC user group(s) found on snapshot: {dac_groups}.")
                for dac_group in dac_groups:
                    response = requests.put(
                        url=f"https://api.firecloud.org/api/groups/{dac_group}/member/{duos_group}",
                        headers={"Authorization": f"Bearer {creds.token}"}
                    )
                    # The original check treats 204 as the only success status
                    if response.status_code != 204:
                        results_log.append([f"DUOS Group to DAC Group Addition ({duos_group} - {dac_group})", "Failed", "Error adding DUOS group to DAC group."])
                    else:
                        results_log.append([f"DUOS Group to DAC Group Addition ({duos_group} - {dac_group})", "Success", ""])
            else:
                msg = "No DAC user group(s) found on snapshot."
                print(f"\t\t\t- {msg}")
                results_log.append([f"DUOS Group to DAC Group Addition ({snapshot_id} - {duos_id})", "Warning", msg])
        except Exception as e:
            msg = f"Error adding DUOS Group to DAC Group: {str(e)}"
            results_log.append([f"DUOS Group to DAC Group Addition ({snapshot_id} - {duos_id})", "Failed", msg])
    return results_log

#############################################
## Input Parameters
#############################################

# Snapshot list
snapshot_duos_list = [
    # Each entry pairs a snapshot with the DUOS ID to attach: ['snapshot_id', 'duos_id']
    ['737d454c-88be-477f-ae2c-ef473e2106ce', 'DUOS-000254'],
]

#############################################
## Execution
#############################################

# Run the direct linkage and show one row per logged step
results = direct_link_duos_ids_to_snapshots(snapshot_duos_list)
df_results = pd.DataFrame(data=results, columns=["Item", "Status", "Message"])
print("\nLinking Results:")
display(df_results)

# Script Development

## Fetch parameters from snapshot/dataset

In [None]:
# Parameters
snapshot_id = "099d2585-1379-4333-b3b1-ffc0d26d95c5"

# Retrieve snapshot details
api_client = refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
# Only the first source dataset of the snapshot is inspected
dataset_id = snapshot_details["source"][0]["dataset"]["id"]
phs_id = snapshot_details["source"][0]["dataset"]["phs_id"]

# Retrieve dataset details
dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["PROPERTIES"]).to_dict()
# Default both values so a dataset without these properties cannot raise a
# NameError below or silently reuse a value left over from an earlier run.
auth_domain = None
source_workspace = None
dataset_properties = dataset_details.get("properties") or {}
if dataset_properties.get("auth_domains"):
    auth_domain = dataset_properties["auth_domains"][0]
if dataset_properties.get("source_workspaces"):
    source_workspace = dataset_properties["source_workspaces"][0]

# Print output
print(phs_id)
print(source_workspace)

## Pulling Workspace Attributes

In [None]:
# Parameters
ws_project = "anvil-datastorage"
ws_name = "AnVIL_GREGOR_RELEASE_01_HMB"

# Establish credentials
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Pull workspace attributes
ws_attributes = requests.get(
    url=f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
    headers={"Authorization": f"Bearer {creds.token}"}
).json()

# Map to schema
# Every attribute is optional on a workspace, so each lookup below tolerates a
# missing key (yielding None) instead of raising partway through the mapping.
ws_attr_map = ws_attributes["workspace"]["attributes"]
terra_dict = {}
terra_dict["studyName"] = ws_attr_map.get("library:projectName")
terra_dict["studyType"] = ws_attr_map.get("library:studyDesign")
#terra_dict["studyDescription"] = ws_attr_map.get("description")
terra_dict["dataTypes"] = (ws_attr_map.get("library:dataCategory") or {}).get("items")
terra_dict["phenotypeIndication"] = ws_attr_map.get("library:indication")
terra_dict["species"] = "Homo sapiens"
terra_dict["piName"] = ws_attr_map.get("library:datasetOwner")
terra_dict["dataCustodianEmail"] = ws_attr_map.get("library:contactEmail")
for tag in (ws_attr_map.get("tag:tags") or {}).get("items") or []:
    # Split on the marker itself so a value that contains ":" is kept whole
    if "Consortium:" in tag:
        terra_dict["consortium"] = tag.split("Consortium:", 1)[1].strip()
    elif "dbGaP:" in tag:
        terra_dict["dbGaPPhsID"] = tag.split("dbGaP:", 1)[1].strip()
terra_dict["consentGroups.consentCode"] = ws_attr_map.get("library:dataUseRestriction")
terra_dict["consentGroups.fileTypes.fileType"] = (ws_attr_map.get("library:datatype") or {}).get("items")

# View schema
print(terra_dict)


In [None]:
ws_attributes

In [None]:
ws_attributes

## dbGaP XML Parse

In [None]:
# Parameters
phs_id = "phs003047"
#phs_id = "phs000693"

# Pull and parse XML
phs_short = phs_id.replace("phs", "")
dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
response = requests.get(url=dbgap_url)
xml_data = xmltodict.parse(response.text)

# Map to schema
dbgap_xml_dict = {}
# When several Study elements are returned, only the first one is used
if isinstance(xml_data["dbgapss"]["Study"], list):
    study_data = xml_data["dbgapss"]["Study"][0]
else:
    study_data = xml_data["dbgapss"]["Study"]
dbgap_xml_dict["studyName"] = study_data["StudyInfo"].get("StudyNameEntrez")
dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
dbgap_xml_dict["dbGaPPhsID"] = phs_id
dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
# xmltodict returns a dict (not a list) for an element that occurs only once,
# so normalize Person to a list before iterating, as is done for IC below.
person_entries = study_data["Authority"]["Persons"]["Person"]
if not isinstance(person_entries, list):
    person_entries = [person_entries]
for ap_entry in person_entries:
    if ap_entry["Role"] == "PI":
        dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
        # Email and organization may be absent on a person record
        dbgap_xml_dict["piEmail"] = ap_entry.get("@email")
        dbgap_xml_dict["piInstitution"] = ap_entry.get("Organization")
    elif ap_entry["Role"] == "PO" and ap_entry.get("Organization") == "NIH":
        dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
    elif ap_entry["Role"] == "GPA" and ap_entry.get("Organization") == "NIH":
        dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
ic_list = []
if isinstance(study_data["Authority"]["ICs"]["IC"], list):
    for ic_entry in study_data["Authority"]["ICs"]["IC"]:
        ic_list.append(ic_entry["@name"])
else:
    ic_list.append(study_data["Authority"]["ICs"]["IC"]["@name"])
dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
dbgap_xml_dict["numberOfParticipants"] = study_data.get("@num_participants")
dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")

# View schema
print(dbgap_xml_dict)


In [None]:
study_data

In [None]:
study_data

## dbGaP Study API

In [None]:
# Parameters
study_uid = 483191234

# Pull and parse JSON
dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
response = requests.get(url=dbgap_study_url)
study_api_data = json.loads(response.text)

# Map to schema
dbgap_study_api_dict = {}
# The mapping is only attempted when the payload carries no "error" entry
if study_api_data.get("error") is None:
    study_api_record = study_api_data.get("data") or {}
    dbgap_study_api_dict["studyName"] = study_api_record.get("report_name")
    dbgap_study_api_dict["studyDescription"] = study_api_record.get("description")
    dbgap_study_api_dict["phenotypeIndication"] = study_api_record.get("primary_disease")
    dbgap_study_api_dict["studyType"] = study_api_record.get("study_design")
    # "attribution" may be absent; the first Principal Investigator entry wins
    for attr_entry in study_api_record.get("attribution") or []:
        if attr_entry.get("title") == "Principal Investigator":
            dbgap_study_api_dict["piName"] = attr_entry.get("name")
            dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
            break

# View schema
print(dbgap_study_api_dict)

In [None]:
study_api_data

## dbGaP FHIR API

In [None]:
# Parameters
#phs_id = "phs003047"
phs_id = "phs000693"

# Pull and parse JSON
dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
response = requests.get(url=dbgap_fhir_url)
fhir_data = json.loads(response.text)

# Map to schema
# Only the first entry of the response is used. Every element below is treated
# as optional so a sparse (or empty) response yields None/empty values instead
# of raising partway through the mapping.
fhir_entries = fhir_data.get("entry") or []
fhir_resource = (fhir_entries[0].get("resource") or {}) if fhir_entries else {}
fhir_extensions = fhir_resource.get("extension") or []
dbgap_fhir_dict = {}
dbgap_fhir_dict["studyName"] = fhir_resource.get("title")
dbgap_fhir_dict["studyDescription"] = fhir_resource.get("description")
dbgap_fhir_dict["dbGaPPhsID"] = phs_id
dbgap_fhir_dict["dbGaPStudyRegistrationName"] = fhir_resource.get("title")
dbgap_fhir_dict["nihICsSupportingStudy"] = (fhir_resource.get("sponsor") or {}).get("display")
# studyType: comma-joined display (or code) of every study-design coding
for cat_entry in fhir_resource.get("category") or []:
    for coding_entry in cat_entry.get("coding") or []:
        if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
            value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
            if dbgap_fhir_dict.get("studyType") and value:
                dbgap_fhir_dict["studyType"] += f", {value}"
            elif value:
                dbgap_fhir_dict["studyType"] = value
# dataTypes: codes of every molecular data type extension
dt_list = []
for ext_entry in fhir_extensions:
    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
        for inner_ext_entry in ext_entry.get("extension") or []:
            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                for coding_entry in (inner_ext_entry.get("valueCodeableConcept") or {}).get("coding") or []:
                    dt_list.append(coding_entry.get("code"))
dbgap_fhir_dict["dataTypes"] = dt_list
# phenotypeIndication: comma-joined display (or code) of every focus coding
for focus_entry in fhir_resource.get("focus") or []:
    for coding_entry in focus_entry.get("coding") or []:
        value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
        if dbgap_fhir_dict.get("phenotypeIndication") and value:
            dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
        elif value:
            dbgap_fhir_dict["phenotypeIndication"] = value
# numberOfParticipants
# NOTE(review): this reads valueCount["code"]; confirm against a live response
# that the subject count is not carried in valueCount["value"] instead.
for ext_entry in fhir_extensions:
    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
        for inner_ext_entry in ext_entry.get("extension") or []:
            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                dbgap_fhir_dict["numberOfParticipants"] = (inner_ext_entry.get("valueCount") or {}).get("code")

# View schema
print(dbgap_fhir_dict)

In [None]:
fhir_data

# Utilities

## Delete Studies from DUOS (Dev Only)

In [None]:
# Inputs
token = ""
study_id_list = [
    '5918',
    '5919',
    '5920',
    '5921',
    '5922',
    '5923',
    '5924',
    '5925',
    '5926',
    '5927',
    '5928',
    '5929',
    '5930',
    '5931',
    '5932',
    '5933',
    '5934',
    '5935',
    '5936',
    '5937',
    '5938',
    '5939',
    '5940',
    '5941',
    '5942',
    '5943',
    '5944',
    '5945',
    '5946',
    '5947',
    '5948',
    '5949',
    '5950',
    '5951',
    '5952',
    '5953',
    '5954',
    '5955',
    '5956',
    '5957',
    '5958',
    '5959',
    '5960',
    '5961',
    '5962',
    '5963',
    '5964',
    '5965',
    '5966',
    '5967',
    '5968',
    '5969',
    '6034',
    '6035',
    '6036',
    '6037',
    '6039',
    '6040',
    '6041',
    '6042',
    '6043',
    '6044',
    '6045',
    '6046',
    '6047',
    '6048',
    '6049',
    '6050',
    '6051',
    '6052',
    '6053',
    '6054',
    '6055',
    '6056',
    '6057',
    '6058',
    '6059',
    '6060',
    '6061',
    '6062',
    '6063',
    '6064',
    '6065',
    '6066',
    '6068',
    '6069',
    '6070',
    '6071',
    '6072',
    '6073',
    '6074',
]

# Delete studies (dev DUOS instance only)
for study_id in study_id_list:
    print(f"Deleting study ID {study_id}")
    response = requests.delete(
        url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/study/{study_id}",
        headers={"Authorization": f"Bearer {token}"}
    )
    if response.status_code == 200:
        print("Study deleted successfully.")
    else:
        # An error body is not guaranteed to be JSON with a "message" key; fall
        # back to the raw response so one bad reply cannot abort the whole batch.
        try:
            msg = response.json()["message"]
        except (ValueError, KeyError, TypeError):
            msg = f"HTTP {response.status_code}: {response.text}"
        print(f"Error deleting study: {msg}")
    

## Delete Datasets from DUOS (Dev Only)

In [None]:
# Print the ID of every dataset in the lookup that was created on one of the target dates
for key, val in dataset_lookup.items():
    if val["create_date"] in ("Jul 25, 2024", "Jul 24, 2024"):
        print(val["id"])

In [None]:
# Inputs
token = ""
dataset_id_list = [ 
    '2153',
    '2154',
    '2155',
    '2156',
    '2157',
    '2158',
    '2159',
    '2160',
    '2161',
    '2162',
    '2163',
    '2164',
    '2165',
    '2166',
    '2167',
    '2168',
    '2169',
    '2170',
    '2171',
    '2172',
    '2173',
    '2174',
    '2175',
    '2176',
    '2177',
    '2178',
    '2179',
    '2180',
    '2181',
    '2182',
    '2183',
    '2184',
    '2185',
    '2186',
    '2187',
    '2188',
    '2189',
    '2190',
    '2191',
    '2192',
    '2193',
    '2194',
    '2195',
    '2196',
    '2197',
    '2198',
    '2199',
    '2200',
    '2201',
    '2202',
    '2203',
    '2204',
    '2205',
    '2206',
    '2207',
    '2208',
    '2209',
    '2210',
    '2211',
    '2212',
    '2213',
    '2214',
    '2215',
    '2216',
    '2217',
    '2218',
    '2219',
    '2220',
    '2221',
    '2222',
    '2223',
    '2224',
    '2225',
    '2226',
    '2227',
    '2228',
    '2229',
    '2230',
    '2231',
    '2232',
    '2233',
    '2234',
    '2235',
    '2236',
    '2237',
    '2238',
    '2239',
    '2240',
    '2241',
    '2242',
    '2243',
    '2244',
    '2245',
    '2246',
    '2247',
    '2248',
    '2249',
    '2250',
    '2251',
    '2252',
    '2253',
    '2254',
    '2255',
    '2256',
    '2257',
    '2258',
    '2259',
    '2260',
    '2261',
    '2262',
    '2263',
    '2264',
    '2265',
    '2266',
    '2267',
    '2268',
    '2269',
    '2270',
    '2271',
    '2272',
    '2273',
    '2274',
    '2275',
    '2276',
    '2277',
    '2278',
    '2279',
    '2280',
    '2281',
    '2282',
    '2283',
    '2284',
    '2285',
    '2286',
    '2287',
    '2288',
    '2289',
    '2290',
    '2291',
    '2292',
    '2293',
    '2294',
    '2295',
    '2296',
    '2297',
    '2298',
    '2299',
    '2300',
    '2301',
    '2302',
    '2370',
    '2371',
    '2372',
    '2373',
    '2374',
    '2375',
    '2376',
    '2377',
    '2378',
    '2379',
    '2380',
    '2381',
    '2382',
    '2383',
    '2384',
    '2385',
    '2386',
    '2387',
    '2388',
    '2389',
    '2390',
    '2391',
    '2392',
    '2393',
    '2394',
    '2395',
    '2396',
    '2397',
    '2398',
    '2399',
    '2400',
    '2401',
    '2402',
    '2403',
    '2404',
    '2405',
    '2406',
    '2407',
    '2408',
    '2409',
    '2410',
    '2411',
    '2412',
    '2413',
    '2414',
    '2415',
    '2416',
    '2417',
    '2418',
    '2419',
    '2420',
    '2421',
    '2422',
    '2423',
    '2424',
    '2425',
    '2426',
    '2427',
    '2428',
    '2429',
    '2430',
    '2431',
    '2432',
    '2433',
    '2434',
    '2435',
    '2436',
    '2437',
    '2438',
    '2439',
    '2440',
    '2441',
    '2442',
    '2443',
    '2444',
    '2445',
    '2446',
    '2447',
    '2448',
    '2449',
    '2450',
    '2451',
    '2452',
    '2453',
    '2454',
    '2455',
    '2456',
    '2457',
    '2458',
    '2459',
    '2460',
    '2461',
    '2462',
    '2463',
    '2464',
    '2465',
    '2466',
    '2467',
    '2468',
    '2469',
    '2470',
    '2471',
    '2472',
    '2473',
    '2474',
    '2475',
    '2476',
    '2477',
    '2478',
    '2479',
    '2480',
    '2481',
    '2482',
    '2483',
    '2484',
    '2485',
    '2486',
    '2487',
    '2488',
    '2489',
    '2490',
    '2491',
    '2492',
    '2493',
    '2494',
    '2495',
    '2496',
    '2497',
    '2498',
    '2499',
    '2500',
    '2501',
    '2502',
    '2503',
    '2504',
    '2505',
    '2506',
    '2507',
    '2508',
    '2509',
    '2510',
    '2511',
    '2512',
    '2513',
    '2514',
    '2515',
    '2516',
    '2517',
    '2518',
    '2519',
    '2520',
    '2521',
    '2522',
    '2523',
    '2524',
    '2525',
    '2526',
    '2527',
    '2528',
    '2529',
    '2530',
    '2531',
    '2532',
    '2533',
    '2534',
    '2535',
    '2536',
    '2537',
    '2538',
    '2539',
    '2540',
    '2541',
    '2542',
    '2543',
    '2544',
    '2545',
    '2546',
    '2547',
    '2548',
    '2549',
    '2550',
    '2551',
    '2552',
    '2553',
    '2554',
    '2555',
    '2556',
    '2557',
    '2558',
    '2559',
    '2560',
    '2561',
    '2562',
    '2563',
    '2564',
    '2568',
    '2569',
    '2570',
    '2571',
    '2572',
    '2573',
    '2574',
    '2575',
    '2576',
    '2577',
    '2578',
    '2579',
    '2580',
]

# Delete datasets
# Sends one DELETE request per ID in dataset_id_list to the dev consent
# service (consent.dsde-dev) and prints the outcome of each request.
for dataset_id in dataset_id_list:
    print(f"Deleting dataset ID {dataset_id}")
    response = requests.delete(
        url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/index/{dataset_id}",
        headers={"Authorization": f"Bearer {token}"}
    )
    if response.status_code == 200:
        print("Dataset deleted successfully.")
    else:
        # An error body is not guaranteed to be a JSON object with a "message"
        # key; fall back to the raw body so that one unexpected response does
        # not raise and abort the remaining deletions.
        try:
            msg = response.json()["message"]
        except (ValueError, KeyError, TypeError):
            msg = response.text or f"HTTP {response.status_code}"
        print(f"Error deleting dataset: {msg}")

## Collect AnVIL Studies and Datasets from DUOS

In [None]:
# Inputs
token = ""
env = "prod"

# Determine the target URL from the env variable
if env == "prod":
    url = "https://consent.dsde-prod.broadinstitute.org"
else:
    url = "https://consent.dsde-dev.broadinstitute.org"

# Pull a list of existing AnVIL studies and datasets from DUOS.
# The custodian dataset listing is used to discover studies; each study is
# expanded once into one result row per dataset ID it references.
studies_processed = set()
results = []
datasets = requests.get(
    url=f"{url}/api/dataset/v2?asCustodian=true",
    headers={"Authorization": f"Bearer {token}"}
).json()
for dataset_entry in datasets:
    study_info = dataset_entry.get("study")
    # Skip entries without a study and studies that were already expanded
    if not study_info or study_info["studyId"] in studies_processed:
        continue
    study_id = study_info["studyId"]
    # Only studies whose description contains the AnVIL platform marker are reported
    if study_info.get("description") and "Platform: AnVIL" in study_info["description"]:
        study_name = study_info["name"]
        # First dbGaPPhsID study property wins; empty string when absent
        study_phs = ""
        for prop_entry in study_info["properties"]:
            if prop_entry["key"] == "dbGaPPhsID":
                study_phs = prop_entry["value"]
                break
        for dataset_id in study_info["datasetIds"]:
            dataset_details = requests.get(
                url=f"{url}/api/dataset/v2/{dataset_id}",
                headers={"Authorization": f"Bearer {token}"}
            ).json()
            dataset_name = dataset_details["name"]
            dataset_identifier = dataset_details["datasetIdentifier"]
            snapshot_id = ""
            # Read the URL property from the dataset just fetched, not from the
            # outer listing entry; otherwise every dataset in the study would be
            # reported with the snapshot ID of whichever dataset surfaced the study.
            for prop_entry in dataset_details["properties"]:
                if prop_entry["propertyName"] == "URL":
                    snapshot_url = prop_entry["propertyValue"]
                    if snapshot_url and "https://data.terra.bio/snapshots/" in snapshot_url:
                        snapshot_id = snapshot_url.replace("https://data.terra.bio/snapshots/", "")
            results.append([study_id, study_name, study_phs, dataset_id, dataset_identifier, dataset_name, snapshot_id])
    # Mark the study as seen (whether or not it was an AnVIL study) so later
    # datasets belonging to it are skipped
    studies_processed.add(study_id)

# Display results
df_results = pd.DataFrame(results, columns = ["Study ID", "Study Name", "Study PHS", "Dataset ID", "Dataset Identifier", "Dataset Name", "Snapshot ID"])
print("\nResults:")
display(df_results)

In [None]:
# Issue the custodian dataset listing request and keep the response object
# itself (no .json() call) in `datasets`.
datasets = requests.get(
    f"{url}/api/dataset/v2?asCustodian=true",
    headers={"Authorization": f"Bearer {token}"},
)

In [None]:
# Echo the current value of `datasets` as the cell output.
# NOTE(review): presumably the response object from the preceding cell — depends on execution order.
datasets

In [None]:
# Inputs
token = ""
env = "dev"

# Resolve the base URL for the selected environment (anything but "prod" maps to dev)
url = (
    "https://consent.dsde-prod.broadinstitute.org"
    if env == "prod"
    else "https://consent.dsde-dev.broadinstitute.org"
)

# List all datasets, then fetch each dataset's details and keep the ones whose
# study description carries the AnVIL platform marker
results = []
datasets = requests.get(
    url=f"{url}/api/dataset/v3",
    headers={"Authorization": f"Bearer {token}"}
).json()
datasets_to_process = len(datasets)
datasets_processed = 0
snapshot_prefix = "https://data.terra.bio/snapshots/"
for dataset_entry in datasets:
    datasets_processed += 1
    print(f"Processing dataset {datasets_processed} of {datasets_to_process}...")
    dataset_id = dataset_entry["dataset_id"]
    dataset_details = requests.get(
        url=f"{url}/api/dataset/v2/{dataset_id}",
        headers={"Authorization": f"Bearer {token}"}
    ).json()

    # Datasets without a study are not reported
    if not dataset_details.get("study"):
        continue
    study_id = dataset_details["study"]["studyId"]

    # Non-AnVIL studies (no description, or no platform marker in it) are not reported
    if not dataset_details["study"].get("description"):
        continue
    if "Platform: AnVIL" not in dataset_details["study"]["description"]:
        continue

    study_name = dataset_details["study"]["name"]
    # First dbGaPPhsID study property wins; empty string when none is present
    study_phs = next(
        (
            prop_entry["value"]
            for prop_entry in dataset_details["study"]["properties"]
            if prop_entry["key"] == "dbGaPPhsID"
        ),
        "",
    )
    dataset_name = dataset_details["name"]
    dataset_identifier = dataset_details["datasetIdentifier"]

    # Derive the snapshot ID from the dataset's URL property; if several URL
    # properties qualify, the last one is kept
    snapshot_id = ""
    for prop_entry in dataset_details["properties"]:
        if prop_entry["propertyName"] != "URL":
            continue
        snapshot_url = prop_entry["propertyValue"]
        if snapshot_url and snapshot_prefix in snapshot_url:
            snapshot_id = snapshot_url.replace(snapshot_prefix, "")

    results.append([study_id, study_name, study_phs, dataset_id, dataset_identifier, dataset_name, snapshot_id])

# Display results
df_results = pd.DataFrame(
    results,
    columns=[
        "Study ID",
        "Study Name",
        "Study PHS",
        "Dataset ID",
        "Dataset Identifier",
        "Dataset Name",
        "Snapshot ID",
    ],
)
print("\nResults:")
display(df_results)

In [None]:
# Echo `study_list` as the cell output (the name is assigned outside this section of the notebook).
study_list

In [None]:
# Echo `dataset_lookup` as the cell output (the name is assigned outside this section of the notebook).
dataset_lookup