# Imports

In [None]:
#!pip install --upgrade data_repo_client
#!pip install --upgrade xmltodict

In [97]:
import requests
import json
import google.auth
import xmltodict
import data_repo_client
import pandas as pd
import re
from time import sleep
import ast

# Function to refresh TDR API client
def refresh_tdr_api_client():
    """Build a TDR API client authenticated with freshly refreshed default credentials.

    Returns:
        A ``data_repo_client.ApiClient`` pointed at ``https://data.terra.bio``
        whose access token comes from the Google application-default
        credentials, with client-side validation switched off.
    """
    # Imported here because ``import google.auth`` alone is not guaranteed to
    # load the ``google.auth.transport.requests`` submodule.
    from google.auth.transport.requests import Request as GoogleAuthRequest

    creds, _ = google.auth.default()
    creds.refresh(GoogleAuthRequest())

    config = data_repo_client.Configuration()
    config.host = "https://data.terra.bio"
    config.access_token = creds.token

    api_client = data_repo_client.ApiClient(configuration=config)
    api_client.client_side_validation = False
    return api_client

# Display options: show full frames (no row/column/cell truncation) in a wide,
# centred layout with three digits of precision.
for _option_name, _option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ("display.max_colwidth", None),
    ('display.width', 1000),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
):
    pd.set_option(_option_name, _option_value)
del _option_name, _option_value

# Step 1: Collect Metadata for Review

In [88]:
#############################################
## Functions
#############################################

def coalesce(*arg):
    """Return the first argument that carries a meaningful value.

    Arguments are examined in order:

    * ``None`` is skipped.
    * A list ends the search: it is returned with ``None`` entries and
      placeholder entries removed, even when that leaves it empty.
    * Any other value is returned unless its string form is a placeholder.

    Placeholders are compared case-insensitively against the values in
    ``remove_list``. Returns ``None`` when no argument qualifies.
    """
    remove_list = ("", "NA", "N/A", "NONE", "TBD", "UNKNOWN", "UNSPECIFIED")
    for input_item in arg:
        if input_item is None:
            continue
        if isinstance(input_item, list):
            # str() keeps non-string elements (e.g. ints) from raising on
            # .upper(), matching how scalar arguments are handled below.
            return [
                ele for ele in input_item
                if ele is not None and str(ele).upper() not in remove_list
            ]
        if str(input_item).upper() not in remove_list:
            return input_item
    return None

def format_description(input_string):
    """Normalize a study description for display.

    Tabs (and a blank line followed by a tab) become single spaces, and
    relative dbGaP study links (``study.cgi?study_id=`` or
    ``./study.cgi?study_id=``) are expanded to absolute URLs.

    Args:
        input_string: The raw description; any falsy value is treated as "".

    Returns:
        The cleaned description string.
    """
    output_string = input_string if input_string else ""
    output_string = output_string.replace("\n\n\t", " ").replace("\t", " ")
    # The lookbehind skips links that already have a path in front of them
    # (e.g. ".../cgi-bin/study.cgi?study_id="), so absolute URLs are left
    # intact instead of being rewritten a second time.
    output_string = re.sub(
        r"(?<!/)(?:\./)?study\.cgi\?study_id=",
        "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=",
        output_string,
    )
    return output_string

def format_phs_id(input_str):
    """Extract a dbGaP accession and return it as ``phs`` plus a zero-padded number.

    The first ``phs<digits>`` occurrence (case-insensitive) is used; leading
    zeros are dropped and the number is re-padded to at least six digits.
    Returns "" when the input is not a string or holds no accession.
    """
    if not isinstance(input_str, str):
        return ""
    found = re.search("phs0*([0-9]+)", input_str, re.IGNORECASE)
    if found is None:
        return ""
    return "phs" + found.group(1).zfill(6)

def try_join(l):
    """Render a list as a comma-separated string; pass anything else through.

    Each element is converted with ``str``. If that conversion raises a
    ``TypeError`` the original list is returned unchanged.
    """
    if not isinstance(l, list):
        return l
    try:
        return ', '.join(str(element) for element in l)
    except TypeError:
        return l
    
def val_study_type_enum(l):
    """Return 1 when ``l`` is set but is not an accepted study type, else 0."""
    accepted_study_types = (
        "Observational",
        "Interventional",
        "Descriptive",
        "Analytical",
        "Prospective",
        "Retrospective",
        "Case report",
        "Case series",
        "Cross-sectional",
        "Cohort study",
    )
    # Empty/None values are not flagged; only a populated, unknown value is.
    if not l:
        return 0
    return 0 if l in accepted_study_types else 1

def val_nih_inst_center_sub_enum(l):
    """Return 1 when ``l`` is set but is not an accepted NIH institute/center code, else 0."""
    accepted_codes = (
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS", "NIBIB",
        "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS", "NIGMS", "NIMH", "NIMHD",
        "NINDS", "NINR", "NLM", "CC", "CIT", "CSR", "FIC", "NCATS", "NCCIH",
    )
    # Empty/None values are not flagged; only a populated, unknown value is.
    if not l:
        return 0
    return 0 if l in accepted_codes else 1

def val_nih_ic_supp_study_enum(l):
    """Return 1 when list ``l`` holds any entry that is not an accepted NIH IC code, else 0.

    Non-list and empty inputs are never flagged.
    """
    accepted_codes = (
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS", "NIBIB",
        "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS", "NIGMS", "NIMH", "NIMHD",
        "NINDS", "NINR", "NLM", "CC", "CIT", "CSR", "FIC", "NCATS", "NCCIH",
    )
    if not (l and isinstance(l, list)):
        return 0
    return int(any(entry not in accepted_codes for entry in l))

def val_file_type_enum(l):
    """Return 1 when list ``l`` holds any entry that is not an accepted file type, else 0.

    Non-list and empty inputs are never flagged.
    """
    accepted_file_types = ("Arrays", "Genome", "Exome", "Survey", "Phenotype")
    if not (l and isinstance(l, list)):
        return 0
    return int(any(entry not in accepted_file_types for entry in l))

def _call_with_retries(func, attempts=3, delay_seconds=5):
    """Call ``func()`` until it succeeds or ``attempts`` tries have failed.

    Sleeps ``delay_seconds`` between failed tries and re-raises the last
    exception once the attempts are exhausted.
    """
    for attempt in range(1, attempts + 1):
        try:
            return func()
        except Exception:
            if attempt == attempts:
                raise
            sleep(delay_seconds)


def fetch_dataset_details(snapshot_id, ds_consent_map):
    """Collect and reconcile study metadata for one TDR snapshot.

    Metadata is gathered from up to four places: the source Terra workspace
    (when the snapshot's dataset lists one), the dbGaP study XML service, the
    dbGaP study-config API and the dbGaP FHIR API (the last three only when a
    PHS ID is known). Values are then reconciled field by field with
    ``coalesce``.

    Args:
        snapshot_id: ID of the TDR snapshot to describe.
        ds_consent_map: Mapping looked up by ``snapshot_id``; a hit is used as
            the single ``consentGroups.diseaseSpecificUse`` entry.
            NOTE(review): presumably a disease ontology PURL - confirm with caller.

    Returns:
        A flat dict of the reconciled fields, in a fixed key order.

    Raises:
        Exception: whatever the snapshot lookup raised, if it still fails
            after the retries. Failures of the workspace/dbGaP lookups are
            reported with a printed warning and treated as "no data".
    """
    # Imported here because ``import google.auth`` alone is not guaranteed to
    # load the ``google.auth.transport.requests`` submodule.
    from google.auth.transport.requests import Request as GoogleAuthRequest

    request_timeout = 60  # seconds; keeps a stalled endpoint from hanging the run

    # Initialize variables
    terra_dict = {}
    dbgap_xml_dict = {}
    dbgap_study_api_dict = {}
    dbgap_fhir_dict = {}
    final_results_dict = {}

    # Retrieve snapshot details (nothing else can be done without them)
    api_client = refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    snapshot_details = _call_with_retries(
        lambda: snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
    )
    snapshot_name = snapshot_details["name"]
    source_entry = snapshot_details["source"][0]
    phs_id = format_phs_id(source_entry["dataset"]["phs_id"])
    if source_entry["dataset"]["secure_monitoring_enabled"]:
        access_management = "controlled"
    else:
        access_management = "open"
    source_workspaces = (source_entry.get("dataset_properties") or {}).get("source_workspaces")
    source_workspace = source_workspaces[0] if source_workspaces else None
    print("\tSnapshot PHS_ID: " + phs_id)
    # f-string so a missing workspace prints "None" instead of raising TypeError
    print(f"\tSource Workspace: {source_workspace}")

    # Pull information from original workspace (if listed)
    if source_workspace:
        # Establish credentials
        creds, _ = google.auth.default()
        creds.refresh(GoogleAuthRequest())

        # Pull workspace attributes
        try:
            ws_attributes = _call_with_retries(
                lambda: requests.get(
                    url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{source_workspace}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
                    headers={"Authorization": f"Bearer {creds.token}"},
                    timeout=request_timeout,
                ).json()
            )
        except Exception as err:
            print(f"\tWarning: could not retrieve workspace attributes: {err}")
            ws_attributes = {}

        # Map to schema
        if ws_attributes.get("workspace"):
            attrs = ws_attributes["workspace"]["attributes"]
            terra_dict["studyName"] = coalesce(attrs.get("library:projectName"), source_workspace)
            terra_dict["studyType"] = attrs.get("library:studyDesign")
            terra_dict["studyDescription"] = attrs.get("description")
            if attrs.get("library:dataCategory"):
                terra_dict["dataTypes"] = []
                for item in attrs["library:dataCategory"]["items"]:
                    # Entries may themselves be comma-separated, quoted lists
                    for inner_item in item.split(","):
                        terra_dict["dataTypes"].append(inner_item.replace("'", "").strip())
            terra_dict["phenotypeIndication"] = attrs.get("library:indication")
            terra_dict["species"] = "Homo sapiens"
            terra_dict["piName"] = attrs.get("library:datasetOwner")
            terra_dict["dataCustodianEmail"] = [attrs.get("library:contactEmail")]
            if attrs.get("tag:tags"):
                for tag in attrs["tag:tags"]["items"]:
                    if "Consortium:" in tag:
                        terra_dict["consortium"] = tag.split(":")[1].strip()
                    elif "dbGaP:" in tag:
                        tag_phs_id = format_phs_id(tag.split(":")[1].strip())
                        terra_dict["dbGaPPhsID"] = tag_phs_id
                        # The workspace tag only supplies the PHS ID when the snapshot had none
                        if not phs_id:
                            phs_id = tag_phs_id
            terra_dict["consentGroups.consentCode"] = attrs.get("library:dataUseRestriction")
            if attrs.get("library:datatype"):
                terra_dict["consentGroups.fileTypes.fileType"] = attrs["library:datatype"]["items"]
            if attrs.get("library:numSubjects"):
                terra_dict["consentGroups.numberOfParticipants"] = attrs["library:numSubjects"]

    # Pull information from dbGaP (if phs_id listed)
    if phs_id:
        # Pull and parse XML
        phs_short = phs_id.replace("phs", "")
        dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
        try:
            xml_data = _call_with_retries(
                lambda: xmltodict.parse(requests.get(url=dbgap_url, timeout=request_timeout).text)
            )
        except Exception as err:
            print(f"\tWarning: could not retrieve dbGaP study XML: {err}")
            xml_data = {}
        study_uid = ""

        # Map to schema
        study_entries = (xml_data.get("dbgapss") or {}).get("Study")
        if study_entries:
            # Several versions may be listed; the first entry is used
            study_data = study_entries[0] if isinstance(study_entries, list) else study_entries
            study_uid = study_data.get("@uid")
            dbgap_xml_dict["studyName"] = study_data["StudyInfo"].get("StudyNameEntrez")
            dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
            dbgap_xml_dict["dbGaPPhsID"] = phs_id
            dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
            authority = study_data.get("Authority") or {}
            # xmltodict yields a dict (not a list) when an element occurs only once
            persons = (authority.get("Persons") or {}).get("Person") or []
            if isinstance(persons, dict):
                persons = [persons]
            for ap_entry in persons:
                if ap_entry["Role"] == "PI":
                    dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                    dbgap_xml_dict["piEmail"] = ap_entry["@email"]
                    dbgap_xml_dict["piInstitution"] = ap_entry["Organization"]
                elif ap_entry["Role"] == "PO" and ap_entry["Organization"] == "NIH":
                    dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                elif ap_entry["Role"] == "GPA" and ap_entry["Organization"] == "NIH":
                    dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
            ic_entries = (authority.get("ICs") or {}).get("IC") or []
            if isinstance(ic_entries, dict):
                ic_entries = [ic_entries]
            dbgap_xml_dict["nihICsSupportingStudy"] = [ic_entry["@name"] for ic_entry in ic_entries]
            dbgap_xml_dict["consentGroups.numberOfParticipants"] = study_data.get("@num_participants")
            dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")

        # Pull and parse Study API JSON
        if study_uid:
            dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
            try:
                study_api_data = _call_with_retries(
                    lambda: json.loads(requests.get(url=dbgap_study_url, timeout=request_timeout).text)
                )
            except Exception as err:
                print(f"\tWarning: could not retrieve dbGaP study config: {err}")
                study_api_data = {}

            # Map to schema
            if study_api_data and study_api_data.get("error") is None:
                study_config = study_api_data["data"]
                dbgap_study_api_dict["studyName"] = study_config.get("report_name")
                dbgap_study_api_dict["studyDescription"] = study_config.get("description")
                dbgap_study_api_dict["phenotypeIndication"] = study_config.get("primary_disease")
                dbgap_study_api_dict["studyType"] = study_config.get("study_design")
                dbgap_study_api_dict["dbGaPPhsID"] = phs_id
                dbgap_study_api_dict["dbGaPStudyRegistrationName"] = study_config.get("report_name")
                # Only the first listed Principal Investigator is recorded
                for attr_entry in study_config.get("attribution") or []:
                    if attr_entry.get("title") == "Principal Investigator":
                        dbgap_study_api_dict["piName"] = attr_entry.get("name")
                        dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
                        break

        # Pull and parse FHIR API JSON
        dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
        try:
            fhir_data = _call_with_retries(
                lambda: json.loads(requests.get(url=dbgap_fhir_url, timeout=request_timeout).text)
            )
        except Exception as err:
            print(f"\tWarning: could not retrieve dbGaP FHIR record: {err}")
            fhir_data = {}

        # Map to schema
        if fhir_data.get("entry"):
            resource = fhir_data["entry"][0]["resource"]
            dbgap_fhir_dict["studyName"] = resource.get("title")
            dbgap_fhir_dict["studyDescription"] = resource.get("description")
            dbgap_fhir_dict["dbGaPPhsID"] = phs_id
            dbgap_fhir_dict["dbGaPStudyRegistrationName"] = resource.get("title")
            # NIH ICs
            sponsor = resource.get("sponsor")
            if sponsor:
                sponsor_ref = sponsor.get("reference") or ""
                if "Organization/" in sponsor_ref:
                    # Keep whatever follows the "Organization/" marker
                    dbgap_fhir_dict["nihICsSupportingStudy"] = [sponsor_ref.split("Organization/", 1)[1]]
                else:
                    ic_display = sponsor.get("display")
                    if ic_display == "National Human Genome Research Institute":
                        dbgap_fhir_dict["nihICsSupportingStudy"] = ["NHGRI"]
                    else:
                        dbgap_fhir_dict["nihICsSupportingStudy"] = [ic_display]
            # studyType
            for cat_entry in resource.get("category") or []:
                for coding_entry in cat_entry.get("coding") or []:
                    if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
                        value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                        if dbgap_fhir_dict.get("studyType") and value:
                            dbgap_fhir_dict["studyType"] += f", {value}"
                        elif value:
                            dbgap_fhir_dict["studyType"] = value
            # dataTypes
            dt_list = []
            for ext_entry in resource.get("extension") or []:
                if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
                    for inner_ext_entry in ext_entry.get("extension") or []:
                        if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                            for coding_entry in inner_ext_entry["valueCodeableConcept"].get("coding") or []:
                                dt_list.append(coding_entry.get("code"))
            dbgap_fhir_dict["dataTypes"] = dt_list
            # phenotypeIndication
            for focus_entry in resource.get("focus") or []:
                for coding_entry in focus_entry.get("coding") or []:
                    value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                    if dbgap_fhir_dict.get("phenotypeIndication") and value:
                        dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
                    elif value:
                        dbgap_fhir_dict["phenotypeIndication"] = value
            # numberOfParticipants
            for ext_entry in resource.get("extension") or []:
                if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
                    for inner_ext_entry in ext_entry.get("extension") or []:
                        if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                            dbgap_fhir_dict["consentGroups.numberOfParticipants"] = inner_ext_entry["valueCount"].get("code")

    # Reconcile information and create final results.
    # Two precedence orders are used: workspace-first for fields curated in
    # Terra, dbGaP-first for fields registered with dbGaP.
    terra_first = (terra_dict, dbgap_fhir_dict, dbgap_xml_dict, dbgap_study_api_dict)
    dbgap_first = (dbgap_fhir_dict, dbgap_xml_dict, dbgap_study_api_dict, terra_dict)

    def pick(key, sources, *fallbacks):
        """Coalesce ``key`` across ``sources`` in order, then any fallbacks."""
        return coalesce(*[source.get(key) for source in sources], *fallbacks)

    is_controlled = access_management == "controlled"

    def consent_flag(token):
        """True when the snapshot is controlled-access and the consent code mentions ``token``."""
        return is_controlled and token in consent_code

    consent_code = pick("consentGroups.consentCode", terra_first)
    if consent_code:
        consent_code = consent_code.upper().replace("_", "-")
    else:
        consent_code = ""
    consortium = pick("consortium", terra_first)
    dbGaPPhsID = pick("dbGaPPhsID", dbgap_first)
    studyName = pick("studyName", dbgap_first)
    purl_doid = ds_consent_map.get(snapshot_id)
    final_results_dict["snapshot_id"] = snapshot_id
    # Guard on studyName too: no source may have supplied a name
    if dbGaPPhsID and studyName:
        final_results_dict["studyName"] = studyName + f" ({dbGaPPhsID})"
    else:
        final_results_dict["studyName"] = studyName
    final_results_dict["studyType"] = pick("studyType", dbgap_first)
    final_results_dict["studyDescription"] = format_description(pick("studyDescription", dbgap_first))
    if final_results_dict["studyDescription"]:
        final_results_dict["studyDescription"] = final_results_dict["studyDescription"] + "\nPlatform: AnVIL"
    else:
        final_results_dict["studyDescription"] = "Platform: AnVIL"
    final_results_dict["dataTypes"] = pick("dataTypes", terra_first)
    final_results_dict["phenotypeIndication"] = pick("phenotypeIndication", terra_first)
    final_results_dict["species"] = "Human"
    final_results_dict["piName"] = pick("piName", dbgap_first, "None")
    final_results_dict["dataCustodianEmail"] = ["help@lists.anvilproject.org"]
    final_results_dict["publicVisibility"] = True
    final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if dbGaPPhsID else "I am NHGRI funded and I do not have a dbGaP PHS ID"
    final_results_dict["submittingToAnvil"] = True
    final_results_dict["dbGaPPhsID"] = dbGaPPhsID
    final_results_dict["dbGaPStudyRegistrationName"] = pick("dbGaPStudyRegistrationName", dbgap_first)
    final_results_dict["embargoReleaseDate"] = pick("embargoReleaseDate", dbgap_first)
    final_results_dict["sequencingCenter"] = None
    final_results_dict["piEmail"] = pick("piEmail", dbgap_first)
    final_results_dict["piInstitution"] = pick("piInstitution", dbgap_first)
    final_results_dict["nihGrantContractNumber"] = None
    final_results_dict["nihICsSupportingStudy"] = pick("nihICsSupportingStudy", dbgap_first)
    final_results_dict["nihProgramOfficerName"] = pick("nihProgramOfficerName", dbgap_first)
    final_results_dict["nihInstitutionCenterSubmission"] = "NHGRI"
    final_results_dict["nihInstitutionalCertificationFileName"] = None
    final_results_dict["nihGenomicProgramAdministratorName"] = pick("nihGenomicProgramAdministratorName", dbgap_first)
    final_results_dict["multiCenterStudy"] = None
    final_results_dict["collaboratingSites"] = [consortium] if consortium else []
    final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = None
    final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = None
    final_results_dict["alternativeDataSharingPlan"] = False
    final_results_dict["alternativeDataSharingPlanReasons"] = []
    final_results_dict["alternativeDataSharingPlanExplanation"] = None
    final_results_dict["alternativeDataSharingPlanFileName"] = None
    final_results_dict["alternativeDataSharingPlanDataSubmitted"] = None
    final_results_dict["alternativeDataSharingPlanDataReleased"] = None
    final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = None
    final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = None
    final_results_dict["alternativeDataSharingPlanAccessManagement"] = None
    final_results_dict["consentGroups.consentGroupName"] = snapshot_name
    final_results_dict["consentGroups.accessManagement"] = access_management
    final_results_dict["consentGroups.numberOfParticipants"] = pick("consentGroups.numberOfParticipants", terra_first, "0")
    final_results_dict["consentCode"] = consent_code
    final_results_dict["consentGroups.generalResearchUse"] = consent_flag("GRU")
    final_results_dict["consentGroups.hmb"] = consent_flag("HMB")
    # An explicit mapping entry wins over the DS- consent code
    if purl_doid:
        final_results_dict["consentGroups.diseaseSpecificUse"] = [purl_doid]
    else:
        final_results_dict["consentGroups.diseaseSpecificUse"] = [consent_code] if "DS-" in consent_code else []
    final_results_dict["consentGroups.gs"] = consent_code if consent_flag("GS-") else None
    final_results_dict["consentGroups.poa"] = consent_flag("POA")
    final_results_dict["consentGroups.nmds"] = consent_flag("NMDS")
    final_results_dict["consentGroups.gso"] = consent_flag("GSO")
    final_results_dict["consentGroups.pub"] = consent_flag("PUB")
    final_results_dict["consentGroups.col"] = consent_flag("COL")
    final_results_dict["consentGroups.irb"] = consent_flag("IRB")
    final_results_dict["consentGroups.npu"] = consent_flag("NPU")
    # otherPrimary carries the raw code only when none of the fields above captured it
    recognized_fields = ("generalResearchUse", "hmb", "diseaseSpecificUse", "gs", "poa", "nmds", "gso", "pub", "col", "irb", "npu")
    code_recognized = any(final_results_dict["consentGroups." + field] for field in recognized_fields)
    final_results_dict["consentGroups.otherPrimary"] = consent_code if (consent_code and is_controlled and not code_recognized) else None
    final_results_dict["consentGroups.otherSecondary"] = None
    final_results_dict["consentGroups.mor"] = None
    final_results_dict["consentGroups.morDate"] = None
    final_results_dict["consentGroups.dataLocation"] = "TDR Location"
    final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
    final_results_dict["consentGroups.fileTypes.fileType"] = pick("consentGroups.fileTypes.fileType", terra_first)
    final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
    final_results_dict["consortium"] = consortium

    # Return results
    return final_results_dict


#############################################
## Input Parameters
#############################################

# Specify the IDs of the TDR snapshots to collect metadata for:
snapshot_id_list = [
    '5184edeb-81f8-406b-926a-64604090904e',
    'd091a2a6-53e7-4721-82b8-09ccef9b13cc',
    'de2da97c-3a14-4a6d-b50b-5dc8e1af2803',
    '2c441f75-dc1d-4674-9118-a93c5141b748',
    'd0709a13-9701-437d-848f-fbce26b3bf5b',
    '7e748fdb-7dd9-418f-957c-7a68f07aaa8d',
    '824afdf1-50d9-462f-9f09-db5a1f646bd8',
    'b9314197-1618-4dd7-8441-38dfb1490389',
    '761e172c-f530-4154-b5b6-a1c52b0530e6',
    'e1c34b81-2435-4c12-87d7-3f995cfd4a0a',
    'cab35bdd-4b15-4836-8470-b922d5761602',
    'cd19195f-25a0-44b1-b47d-ec99141833fc',
    '9a61b980-4a33-465a-bc50-1aba00bc2cf6',
    '737d454c-88be-477f-ae2c-ef473e2106ce',
    'dd2b61fb-d420-4a38-9cd2-8464f51d7617',
    '757824d3-599f-4fab-985d-9ed847d06a62',
    '768c753f-6c78-4de0-98f8-80ee7878f23c',
    '1d385cfc-4bed-4f52-8f7b-ea54fc44b4f7',
    '7dead1cc-1830-4ca4-bbb5-cd9919f46f5b',
    '56876495-51a7-4d6e-b6ab-46f3da5b8100',
    '2e6bd160-8851-426f-be6d-8eb61bd142d2',
    '4d995f30-0c7e-4a98-88a7-1a7e58cbeef2',
    'b7a9f284-01a1-47a6-a1bf-57ce7b2e674b',
    '8ee64987-5785-4b05-b5eb-5ee4e074c558',
    '428c8260-1b27-446c-8484-a28341b41dcc',
    '20a5cdc2-bd2d-478c-8398-2b219565c290',
    'faa97a6f-3a33-4cf0-b6dd-c29fea9b1398',
    '1b20f271-5af5-4b72-8a81-d31ac8fac5f9',
    '328745cc-e527-4780-af6f-30ab69d26702',
    '0b0de78b-bd70-4a78-8963-9e66f04b86d1',
    'e4cc5f9f-a277-481e-9563-6d30035578ef',
    '804f1129-2ad2-4f0e-8751-a60ccb14bdbf',
    '1c9ddf6e-e641-4b2b-84fc-c7966efa1e66',
    '6d70ee06-d809-443f-b018-0bc23cd880ea',
    '00d059f0-afc3-4c72-a61d-6925194e220d',
    '794ab48a-2707-4713-937f-492b01af56d3',
    'b7731031-2e57-4948-8900-a6d549c3fd1f',
    '5fb13a59-b09f-42c2-9a02-9be2e9d16e91',
    '689891c3-a977-4aa3-a507-6343e177eedc',
    'f6d8456b-4ebd-487a-a71d-00337cc0c7a0',
    'c9d30b32-ae82-475e-a8bc-d88e0c489aee',
    '6c22fd04-7226-4aba-900e-0060ff255b0a',
    '8e73f31d-403a-458a-a1d2-c9048c24310b',
    '1415eb56-449e-473e-a8bb-f8616c1ff851',
    '6dcadf4a-71db-498f-87be-3b6bcec912e5',
    'e31b204e-f42c-4774-a239-91968b13a682',
    'f2797094-662f-4041-b373-338d89ac5a7f',
    '0cd0b54e-de28-406e-a0e1-53ac23a0e683',
    '26df2a34-b10d-4361-ba2b-d9f966d09f61',
    'ba915a8d-24d0-4a94-9220-4f1d058521a1',
    '6511b7df-04a2-499c-b940-7aa6e337abbd',
    '59e23ef1-8c18-42c3-a075-b5e5e5e16dec',
    '2e342ea0-83e3-4df9-ad9b-867ce04c14b1',
    '41ded0db-2a2a-451b-8a9e-0d94c2a81fe1',
    '7e63fe80-9ae7-4c57-b87b-963ef7999c64',
    'e0bbd924-bd13-44c7-946e-d89b6becc627',
    '0254cc08-1474-4b3c-ae99-f7d853042dc8',
    '531db83e-3f7f-4732-81f0-013addbf2a8e',
    'f1c03eab-24bc-4b3a-8aa9-d6696dfaaf31',
    'f875fdc4-f57d-4a4c-9b22-daf101156d26',
    '27068295-b3c0-4260-9447-9ca96814d46f',
    '2e6dce09-c48f-4aa4-8d76-a4c8bb53b4c7',
    'e588585c-4e81-422c-9058-746203958824',
    'ff2e3cd4-44c4-4068-9e57-3023a3e533c7',
    'd658a2fc-808d-479b-8aed-8f2a3f2993dd',
    'fd57042b-4676-49ba-9d2e-161c83e0f3bf',
    '32025456-3114-4140-a712-d38122f3ee71',
    'a39fc400-2146-4949-9a94-fd3d4f1b182c',
    '02706895-171d-403f-9f36-fa7e45d09a9c',
    'dcc578ed-44bb-458f-8ff5-a78ca83f4616',
    'aa42debe-3747-4dcd-8bc9-24eb90673fa5',
    '085ec55f-db41-4669-86fa-aaa95e430b95',
    'cd1181b4-b3ed-4c78-94af-aed1edbc64b1',
    '989ccbba-f39f-460a-95d1-e7542529c26c',
    '988c0084-bf6e-4838-846c-373a01f3458c',
    '47e551d2-0850-4fdd-98dc-cf01eb6ed839',
    'a2da748b-fec8-4e10-88ee-de32cbe8dee1',
    '5f95a5f3-319b-426c-afa0-2b4d1773411d',
    'eb719163-2dff-41a0-811e-3c00d182a7f6',
    'e5d2f3f5-7bac-40d2-a127-1e82a658df52',
    '28dc8121-5e55-46c2-8313-681de2298986',
    '060c707a-2f0d-4730-bbd6-d25489abfcf6',
    '6a242848-a716-4de9-ab38-3c82983810a8',
    '7e59197f-b859-4279-add3-de24bbc7e52b',
    '6cddc085-3dc4-4c30-bbe0-f56b3d34eb4b',
    '1eaeebfe-c381-4419-bd1f-24d0ef0b34a7',
    '624fef99-e4ce-4c12-a3d9-90995b5da970',
    'a68d3145-81c2-41f8-9944-5e4a5058934a',
    'a3b18d45-96c2-4526-8fde-65ab3265868f',
    '3ec72891-87d2-431f-850c-e52013330ea8',
    'ea82e45a-b5f3-41a1-a392-08cb3ac6d585',
    'e05f5d24-4edb-40e1-a293-533f33c2c86d',
    'ea90e903-9835-481e-b3c1-7451d2211de7',
    '6b8b2cc4-be14-443e-bda5-eed5fe0ffb2e',
    '30851e99-bbd0-48d3-b4f0-e3525b0506ca',
    '50a37ecf-071a-4f8f-9c72-70280973f9eb',
    'ad71f2cb-a73e-463c-b0c2-560fa0f7bc67',
    '895f4ecd-fdda-4e85-8fee-be0721b74184',
    '369de272-7e29-4f76-8f26-87d4d941fa38',
    'e9dcabec-7cc0-482b-83a5-f596e7a98db0',
    '899bc1e0-c708-4ebe-8b79-b7e1984995d7',
    'ddf7ee7d-3234-4f8d-a1e4-305588cd1009',
    '5cd2e542-1090-4dfb-a7a5-b276b32e58dc',
    '9a15fed4-ec1f-45be-b468-6e980c3bbbaa',
    'f2480e7d-609a-4f35-8f67-9f02561928f7',
    '556b008a-083e-49a5-bb70-b80b5799e8ea',
    'cc8cc17d-1ae4-4303-abd2-4728a676e5a2',
    '613ec6f4-dafb-4689-b109-4573ddca5853',
    '96160d85-eca6-4b6a-ab7f-d33dffef013c',
    '3c672fd0-d723-49f4-b2c6-d24d2658a049',
    'c68b342e-35c9-4fe5-80ef-2bb821a942bd',
    '44373227-8b15-4524-9ecd-57592c52a6f5',
    '6e293720-2935-467c-b5fc-0f257eb1fb68',
    'e3823ffe-3070-47b2-a0fc-7c0138e6c61a',
    'ab1d91ba-6aa6-4a40-8c10-2a979cfb29ab',
    '1bb35cec-4174-445b-a646-ff707abc2fd9',
    'a632ce72-5246-48fa-a140-f97ee6e9d9b2',
    'f87a7821-288b-4bc3-93a2-94ae34604540',
    '4a532da2-a8cf-4d5c-9e0c-93c0a1af9084',
    '6e6c8a5c-48b5-46d5-856f-28385f67e0fd',
    '78cc6f1c-6d17-4344-98d2-e18b0fde2365',
    'ae101395-36eb-4d59-9970-6696b82057db',
    '8f987445-a04a-4121-ab2e-c34cc8dce719',
    'f4a05db7-ff6b-4d75-8e87-68628830160d',
    '5c65174d-ed85-405a-96ca-5a41e5930265',
    'd431721f-060a-4b9a-b4d5-0d19fbf6ae0e',
    'f185a14c-aab5-41e5-a891-74d9653e3e0a',
    '6c392a22-a8c7-4e5d-a174-01026284dad5',
    '5780d857-a368-4f7c-88d8-2d145552a01f',
    '12c4738d-4d27-4776-b7d1-73a6b74fa56a',
    'e95d4773-7a36-4031-ba31-920856187300',
    'acf4504a-eb85-4aba-9ffb-1baa7266ad82',
    'a37d9def-52ca-488e-9468-8e2e211fb3d5',
    '658b1d66-1dfd-4c45-8b54-737a877cff74',
    'd0263ca8-b8c1-4b10-9977-3558104c9154',
    'd70b16e2-ffe1-4e63-837a-1f3e392e9f35',
    '3c3f273c-2904-4900-97aa-6638e796598c',
    '3984cfaf-0034-4b7e-ae21-8ae9810a62a1',
    'f043891a-8919-4e90-8008-9c38c6fbf312',
    '5c8ddfe9-2abe-437b-93ed-409c3ea5f488',
    'af39da4d-e004-42dc-b640-a0c27d6ba0ce',
    'c3856d07-55e0-412e-9c36-6363e9520e18',
    'cb350574-0522-45fc-b592-181a86cb4d17',
    '42d2dc1b-9ffd-41ad-84b9-b92ed984470f',
    'f0abbd9c-2c11-4d9b-be8d-de19f18ddfd0',
    '728f209a-ef9c-4303-a93f-a7958dc40f0c',
    '14bcf9ad-86ff-4983-967b-2a1ce86ae864',
    '0ffa30ef-91b1-4908-b148-58191f64c97d',
    '68b17a9d-48d2-4996-a3e6-3feb85011706',
    'c8c3bb66-e4bd-456f-9d38-e82816118807',
    '4d39a01d-0ed6-42b5-9200-91b0d848a42b',
    '5fb6214a-9594-4ef7-b1a6-d2efd7fb5c87',
    'e0e41b16-d394-4cb1-848d-fcaff4a8eac7',
    '2d5d9ec7-b748-4ae2-806f-f4bd687c36da',
    '8eb8326d-a74a-4bee-b4ea-b1d211114996',
    'aea7b522-eb2c-451b-b7fa-7bd932f1b971',
    '0b2be5d6-4fe1-4afb-9106-2f4dbca31d03',
    '63363aed-e5ea-4ba4-8962-da03369ca536',
    'b550b4cc-d3a5-4317-bfc0-5e46c77968c3',
    '11ecd102-9dc5-4cef-a838-a229b598fc76',
    'da06cf38-5f72-439b-9464-fb5448bb6d6b',
    'cd24ca43-95f7-432d-b729-3b62d9f95324',
    '9fd7edab-f1b6-4fe8-98fd-4ea4c2d34501',
    '15227f2d-06d8-4b02-89f8-e59ae4057f88',
    'b84e9146-b4d8-4685-b9b1-541b2da269bc',
    'f61d6193-468f-45ed-bf0a-75a5662871bd',
    'be47d532-a9d8-4a86-bf58-00b2920dd320',
    '5630c567-3752-4fa2-8124-c1b8bca37aca',
    '52f35032-2afa-4722-983e-e88c8cb808ac',
    '51865c0a-9548-4fea-a6e5-c8754a0bb085',
    'a1dcd80f-6390-489a-a34a-168f26690a36',
    '4fe793f8-96bd-4ece-a8b2-1e4fb6712b99',
    '5250257d-e2a3-4cc5-bb07-aa8b03421ad0',
    '9a17587a-ae6b-481a-8d88-f479981c767f',
    'af99a317-e7a6-4e0f-88fb-f2a6c438ca5d',
    'c3bb5d5c-dad2-4762-ac97-a8d920b414b5',
    'ee427556-94a5-44d9-84d6-322ed4419ac2',
    '213a8eb7-5c74-461a-9677-e04e978cd7e3',
    '2c670fb3-af55-4a30-bb31-4ebab5a0d3fe',
    '9807332e-22e3-41de-bc41-a9944ba364fc',
    '7bedbdb4-ec95-4011-8464-cfb267ff343a',
    '29471c9d-7165-46d1-adf1-6a40ed905354',
    '807e1239-6442-4a3c-a453-7919033aa03a',
    '0d85a6bc-fa74-4933-8537-61d4792159ee',
    'da818a37-2a60-4315-a6aa-333ea00e9e6f',
    'e7c1e9aa-dab7-4a15-a9e2-cecbeb6989a9',
    '208c3ef2-a34a-45ee-858c-38c9dcf86396',
    '4dc90a20-5c4c-40da-9b36-beaf740f8983',
    '9bc462b1-2cb5-43e6-82bc-ec257ed35455',
    '569905e0-65a3-4e70-b30e-8c731568c443',
    '0c011d7f-1aad-49e7-8033-d6b036153f46',
    'a0408818-ad55-42fa-a1b9-84537a4b3eed',
    '48df4d69-f578-49d8-a320-0bec2fa5711b',
    '49b3d29d-f734-43e8-9454-ea3ab9631341',
    'e6dfa202-d2a0-407d-be70-84cb53c9f9ec',
    'c191a23a-926c-4a61-8294-27496a41a4da',
    '749af3ac-a652-445b-a2c0-80f24aca15f8',
    '3e4fefea-7935-4a4c-bba5-84109c9a800e',
    'afd608ce-943f-47e1-8d80-fdf43d58812d',
    '2fb44dc8-06a9-4990-914e-63479c185299',
    '4e03ae15-3680-4690-95cf-336a86ddd7a5',
    '2e8e7c13-3c64-4686-a5fd-0b664bf8510e',
    'f461ebf3-239b-445e-9540-7b15b64998c5',
    '356dc4cf-688c-4299-b4a0-9c3d839c1490',
    '0986817d-bbf8-4614-89b4-68ca7c69b0e1',
    '6e429241-ea4e-4273-a92e-3d4978b55047',
    '617b50da-87dc-47e2-813e-9271378f3280',
    '968929f0-e200-4b68-afb2-f0656d5d6bfc',
    'ed56f6ae-6c43-4e1f-b3cd-746e03a29316',
    '6c57d44a-2bf2-4b27-aa51-fe341357ab84',
    'db2370df-cc70-4a6a-9146-fc99ff8eddac',
    '4602d8f0-a679-4c26-9b83-608d04abab99',
    'e036b126-1249-4661-98d4-db6218f351e8',
    'a08ee68f-0e5f-4cd3-ab88-b3740ddf709a',
    'e9fa838e-b173-4262-8fb6-e5eef53856ab',
    '8c634fb0-da0e-403c-8e4a-13cef21411a7',
    '636272e4-d4e2-4a25-ba10-e1d1cb9352bb',
    '0acc57c8-5c8e-44ab-bab5-6dcb7b6465ef',
    '63a987d4-bfca-46c4-bbbc-afdbf357308a',
    '538662ce-44d2-4fb9-ab3d-f02342d26761',
    'f58f9cc2-70e0-40fd-8adc-674adc503f8f',
    'ffc3e5ac-95e6-464d-92dc-1c4fd1ca394e',
    '7406b139-dae3-499e-94f5-b762fee73bd7',
    '500080ec-6911-4d78-942c-b0d4c7143894',
    '8b726cda-c018-45de-bb98-39915c912035',
    'ea9298ef-2d9c-4237-9a9b-48a8854ee042',
    'b6ae2316-8eed-49ea-ab6e-3425a9527549',
    '2a8eaabc-68e1-4962-bf1b-332f1b856a78',
    '2569d9ca-20bb-41d6-a7ad-505b7a2c33e8',
    '9333b7fa-71a5-4be9-bcb7-10334a87d698',
    'bd8d5afd-4316-4658-ab8f-568d90cf9517',
    '4fc3ed75-feda-4498-89b3-46024655704b',
    '6cdddd59-711f-4d72-8383-cfa349d58a3d',
    '423099cd-1739-45e6-9225-06bdabcba8f7',
    '14810a30-fbc5-459d-b2c7-0378125e25bf',
    'b1cb0a2d-9c11-478a-82f7-6b239a6b7ca0',
    '1320c44f-c27b-42e3-9870-5676d340e923',
    '1918b1ee-fa43-49a2-8e5a-d3730c0c20cc',
    '5681d110-8c84-478c-9d1f-7935a54b86ca',
    '24c427f6-17b9-4cd1-962f-92a12b090d8a',
    'f8b5dcf4-e9ff-42d5-860e-f36033d62522',
    '6173529b-c677-4fa2-9580-feda9fec3f4f',
    'e5424ee2-ebee-494a-ac5e-16d7c56453ac',
    '9a5be8f4-eaa7-4358-8fdc-470a6f1da79c',
    '6fdea8c7-69d9-466e-9fa2-aca30722ff68',
    'e577339f-cda8-46d4-9c3f-aea5ac154c43',
    'a218159b-1333-4550-a3e0-bf8610425fd8',
    'ad6660ad-3052-4f68-8e8a-febd57adb43b',
    'e3797059-80ed-463f-89ca-e77589f2fdb4',
    '87d02347-d169-4ce0-9027-3c8e11e48c40',
    '8857ce53-0bb6-40be-a536-3dc658723419',
    '40ebc4a1-94ea-4b5b-adeb-89b171f2a957',
    '61b6ae23-ca19-4d31-bad3-2281a8528886',
    '7c4edc65-bfe6-4ede-a68a-c0b9d2564f29',
    'e511ebe0-d2a3-42ad-b06a-ef083bb6d943',
    'f330517e-46fd-4de3-8063-015b524a7324',
    '2529f127-cff5-43ff-b879-06bc0e3468ff',
    'f0d8bb27-1695-4faf-8b27-4b95260b8f17',
    '17d14df1-cb64-4aae-8049-c1728a3c0c81',
    '434f85e2-4435-483c-8099-b03c8ba794ed',
    '148bba37-06ab-40b7-a0d4-a04fc515465d',
    'b5271312-1c86-4336-b039-4216f95e298c',
    '62d2e36d-487f-46a2-9938-8910c4c33f65',
    '5bba97dc-d6ab-4329-912f-148c8b807056',
    'fa2552f6-b6f8-478a-8fc6-19fb6d612837',
    'c3d22305-b3f2-4561-a5b9-bed82ee742f4',
    '9fe2abd4-70b4-4eee-b00d-38726ced8620',
    '5329c25e-ccad-435d-9250-6fcc3ff88472',
    'ced601b2-9a11-40e9-8067-241e5a5996ed',
    'fc513b58-cfb7-4871-8694-8dc372fc2e10',
    '9cf61d88-d096-4981-b0c6-99db77554c01',
    '4c722626-c559-4f5a-84bd-8d7d46983e1e',
    '7c237e08-3329-4e64-bd2a-063be290e78b',
    '4117144f-92e7-454f-9263-dad5e128cadb',
    'ce2e7235-26e6-470f-8e05-298193b7f53d',
    '6df525e1-b143-4e6f-b667-80c783ae1b66',
    '92666b7c-4d50-4530-88e9-ea2d3da9d07a',
    '42644c25-fa23-4b4e-8fcc-907cd8dcef60',
    '155c11a9-638a-45c8-b172-7cf2e3e16fe6',
    'b3da9fec-08ad-4496-a9ac-1411388fb5cc',
    '0de07296-e3ff-4fe6-9183-9f421484197c',
    '1b6273c6-7769-4daf-abee-93b11b322c73',
    'ea50255a-45a4-4846-82e3-02b4f46f5b17',
    'eb7045e1-2286-49f1-bce6-21b5d7fa5c32',
    'b763c288-4132-434a-a6c9-25ad51b9d961',
    '079eb53c-e2b6-4da6-ab5f-fc2136a3ecc1',
    'ea4c6dab-ad3d-40f0-95c9-e68ee79f7a6c',
    'd3dc5627-503b-48a5-ad79-31ab6c2fd417',
    'ec14f8cd-5b1b-4124-a235-f11159984c7c',
    '6d9e1212-4fa6-4632-be8a-75c45a474dd3',
    '667eac9b-4e90-413d-80f3-d857b9829ab7',
    'cdd689fd-10f3-4cfa-b738-46549e689cac',
    'c091ea30-1862-4b1f-8e92-087b441472c3',
    '43c86818-9bfe-46f2-9ae4-4a55a7baef1f',
    'ebdaca04-ef29-42f3-8486-a94dade81bf8',
    '8165245c-2003-4ec7-bf57-731959022d47',
    'c4dcf7e0-195a-4885-8864-55a9d65cdb5f',
    '77fbf845-e43c-4015-93d3-6acf55d83022',
    'cea0dc44-e5ad-4116-aaca-d4c0dea68547',
    'b052703f-ad71-44ef-b76a-654cc13fa97e',
    '510abca2-02d6-4773-909e-70746a444987',
    'db6d79e6-6064-4619-9e49-d3ee054c8302',
    'a2c0bd50-4f89-4f1e-b25e-0f0c56b29b31',
    '18dde45b-410e-4046-a051-46885a21c02d',
    '30d1fb84-6746-4d0d-8d68-f1c9cf955504',
    '5d6cc84c-f03a-485a-8f90-1b44c1fa55c9',
    'ea0e5966-6573-4e1f-bd11-48f64595fadf',
    '1a26532c-16e6-4f1c-81f9-8f07a8181421',
    '28559e94-ed57-48c8-bc8b-6cc4ad659a61',
    'd67d11a8-4356-4cae-89d2-92e724f93f2e',
    'b78a77b3-0a1b-410c-8afe-193d277e645b',
    '2a1375fc-a976-4327-829f-d0d0f6155cc5',
    'ce1bf5c3-525e-455d-a1e9-dd5f3d68c9d3',
    'd0a6aa4c-821c-4bba-b53b-4f230ca3cda4',
    'c6262801-594b-42d1-bf08-154f64cd76d5',
    'c9ae3cd3-2174-4e76-a610-a54c95378a98',
    '3ac713b5-3645-4381-ac66-ecbc281a2ab8',
    '4911bd18-5db9-418a-9dc0-0ea28ae937d6',
    'c5f294ea-87e9-4cbb-8099-2b5401add5a2',
    '5fb3cd44-691f-41ef-a009-5a401b5fcae5',
    '2302effc-1f0d-4618-a360-543e1892a549',
    '33c854eb-d228-4a82-8324-5e455ed1e447',
    '533ba93b-506e-4547-9174-037a6b17835d',
    '06216d97-7d1d-4105-bf60-958b71c02cfd',
    '51e19b3b-8a51-4e2b-8a9a-bcbb95921a28',
    '39e02242-e1bb-4937-b3ee-d7f81e094d75',
    '87b55203-983f-467e-b496-9a0d21f4151d',
    'f06adf86-4526-47a8-b59a-2bf137e034d2',
    'bbd04481-0b9d-4c21-ba65-a43638116e0f',
    'dbdfebae-3eb0-4fc5-b744-eb901da3591c',
    '29fa069b-8df8-4fb3-bfa8-01e0504d050c',
    '2b78a3ac-8bca-4938-bc7c-26a60f9c04ac',
    '4bb891fc-fcae-40cc-bf59-73716de7e04e',
    '508b9f8a-c827-4dc0-8319-6aeb90482bdf',
    'bfd29198-ca9c-481a-ae8e-d8ec49bdf84a',
    '56187783-02d4-46f8-bc8a-cce00125ce58',
    '20eb6baa-99b8-4e24-97e3-98a402fbe975',
    'f6da1eb2-9dec-48fd-abcd-d98bf2d21e47',
    'd370b858-4fb6-413e-8bfb-97f98e8f3d77',
    '574e0d42-e712-4a86-be7a-4b3a95187bcd',
    '56078c29-a393-4c60-9e04-3674e02fe729',
    '099d2585-1379-4333-b3b1-ffc0d26d95c5',
    'a35fc432-b9ba-4633-bef7-4e317ff34df5',
    '90fe2016-e79c-456c-a5f9-3a31149fcd65',
    '3bdbad9e-f9d4-4442-8606-791d490bf0af',
    '02d25240-823f-4b1d-8562-95385716a453',
    '1974a21b-c409-4736-a3d7-e195fa96c4eb',
    '99b46287-4790-492c-8a12-bea33f0f927c',
    '08d19a7e-b868-4766-9f7e-d879d972cbd7',
    'c6ef5822-3929-4ae7-b5bc-dc27528bf226',
    'e43974fd-cee1-4d8c-a436-6846d7d24129',
    '0d607d21-c9c7-4852-83e3-76825176ee0a',
    '0a356156-961d-4829-b9b5-c07fbc73dacc',
    '4c8ce027-8094-4f5d-bf62-22b1d51b3c1e',
    'c5a872a4-1b04-4608-8fcf-92aabf63e4a1',
    'eb7948be-1007-4b0e-b9b6-a5c40bbb9596',
    '07d2b703-db1f-418a-97ed-eeeab088718f',
    '7639a9e0-275c-49a8-80c1-cdb01ce23e1c',
    'c1c674dd-056a-470c-8874-bf70d8fae3a8',
    'aa2bfacc-c28c-4192-960c-b1389cf68516',
    '5b036d13-e058-4d8d-be91-6fdd070686a7',
    '8fd5b447-77b6-4c33-b66a-a5cc63587220',
    '541aa72b-7771-445c-8abf-6620f54f881b',
    'b7b2f00b-5bac-4996-a23a-1df0d4099157',
    '410667f8-8811-47bb-b5cd-ddacba7185e5',
    'd7349942-f8ff-4ad6-b075-8f39652a7789',
    'c51470e4-cb99-45d7-8ffc-3d346e557b4d',
    '44b1f60b-e74c-4430-9378-d4a75e2de72f',
    'a4c62d7f-34f0-4e2e-9e46-c762d3ab0ff2',
    '6a5b3be6-d1de-4f23-a431-b08e7ab231b8',
    '71a30dc1-0dba-4b78-9cdc-77634b07d8da',
    '5208772d-21f9-46b0-8167-0b05b57296b8',
    'ffe34538-3ddd-48de-b4a2-94f9b2dad086',
    '0f6db24a-05d6-46fc-9ff6-795e29d10ca5',
    '6b83b9e6-c276-4be3-b459-d99dae094508',
    '5955a235-5be6-47bc-8303-ed0c4e68f501',
    'b95644c0-34f1-42db-93b6-4e22ea7f4701',
    '36690013-e8bc-43a5-9ba9-83317537557c',
    '172bada7-f1c5-41c4-836d-05381beaed9a',
    '9a1e873b-b1db-4d3e-a83b-ed6c5b3f3ecc',
    '2c6de04e-104d-42c8-8448-97d74985dacb',
    '452bcafd-ab45-4e24-b5e0-13fcf22b0755',
    'fbafdd31-21a0-44c5-ae4d-724839beff61',
    '2a1882d9-88ca-4849-bcc1-f6914f593407',
    '3838993f-59ba-4dec-8110-ac3ea387ab91',
    'bf2f4106-cee9-419c-b4d1-d7b03a6293d5',
    'a6c36f5e-b86c-4164-85ae-8bf0df2e4a90',
    '5e547934-c339-410e-a013-dfefed50f4b8',
    'ffa84feb-ca0e-43d3-a04d-a402a8e24a3b',
    '2be072bd-2153-4050-9358-e4b95297a9bf',
    '392ea719-6bc9-46ab-8a29-33f7521ae3d6',
    '1851ecd5-5e95-4ca4-afe4-9493d2dc55c0',
    '7c19d852-e36a-4353-afea-10e501601d9a',
    'fd3843fe-ee5d-4784-b0d2-6673f9886d30',
    '84703c54-a9dd-400c-9701-2fc40922e3e3',
    '00297802-e20a-413f-b389-a6f764b6600e',
    '70a527a0-ced5-44d5-9c43-cb57911f5042',
    'f2a7be5a-4f7a-4a96-935e-ca7592855b45',
    'c853d4c0-d4be-433d-964e-e30bdc35480e',
    'ff27037e-cb52-44ef-8979-f6e7ac3ed6f6',
    '830df9ed-e4a6-4c9a-a97a-aa080fb030e4',
    '9321b908-f2e4-437b-b53e-ed81754dcace',
    '7c90289b-be3e-4c9b-917a-d5e27d95dc15',
    '3e85b06a-a6ea-4ce8-a655-44b1fce12138',
    '40c4297e-d492-4f6a-b651-ee9ee38db14b',
    '40c4297e-d492-4f6a-b651-ee9ee38db14b',
    '40c4297e-d492-4f6a-b651-ee9ee38db14b',
    '8956cc4d-58be-46ae-a81e-74607ffbd9d3',
    '03e54581-8fd3-47c3-9143-55368d2e4e86',
    '9efd748c-ad09-4765-b645-1b6ef6b5d402',
    '9efae3c7-904c-48a8-939a-e82b46005ae1',
    '0f46a588-b4ff-4a69-99e9-0a0bcf052522',
    '6e674477-522f-4adc-8c50-76910a6a282b',
    '504089f1-c59d-48fe-84ef-858bd3eb3043',
    'e91ccc70-2772-46d8-b586-cf3e270a05b5',
    '247dacae-8e6e-4928-ac45-421d19b549a5',
    '1c4015ec-c6bf-48c6-9a8f-144498bac5ae',
    '4645cb09-c96e-4750-bc8f-b9ba2e61f2fa',
    '4b5e77cb-b79c-4a6e-aa67-ab23afa9d141',
    '1254ef3c-3f58-41c9-a52e-eafef4492c13',
    'e224f92f-0b40-420b-8e7a-dbc268107bc2',
    '8873dbd4-e4a4-45db-8908-3c68593031d6',
    '2185fe0d-9816-4f11-ab65-ee8f969847fc',
    'c5514b78-183b-4b72-9e32-2e473c63086c',
    '9251ff73-9932-4ead-ab96-91feb6c56935',
    '94abf97d-45c1-441e-8d6c-355e9557b9e6',
    '721a0e30-9c7c-4ea2-aa5b-d8a1416e60a1',
    'c0dbb47d-4159-44e1-9c4a-87a51390ed73',
    'd223db2c-cc91-47cd-9fb0-050e0e7940f0',
    'bccd9f0a-d1e4-4d48-9d20-91b293e4a57e',
    '1be2372c-07c7-434e-be09-1f3ff095ecf1',
    '825157b9-d84b-474b-90f6-0994e7bac378',
    '4dcde6d0-a57a-4fcf-8281-882f783d0583',
    '1a256e91-a1b4-4db8-91cc-3be204872b26',
    'd0acae97-256d-44ac-a55f-efe4f6cf2af6',
    '696b5a81-c93e-4acd-8448-4b5576d14ea0',
    'e00f95e2-92d1-41a8-9bc9-d4978ebc68f9',
    'c890024c-40ed-42db-ae45-b119d038461e',
    'cc033b0a-6285-426b-8d6c-f29739b62920',
    '4022a967-0753-4f74-a682-b980528c112d',
    '68af6886-c7de-4a2f-abde-0314a301ac1a',
    'ba1a01e5-23e3-417a-a45d-91368dce617c',
    '46bb697c-4b2c-4ae2-90d8-4fed2a00f831',
    'a0bbd011-9be9-4175-982c-b2d84abccb6b',
    'ea4e8c79-b6bd-4b24-990e-624de9d15835',
    'c98c2d47-ebe2-42cb-adc7-be2475812bea',
    '853f9898-8b51-49c6-995c-2a9fb1839d60',
    'f3ed7376-db20-4ee8-80e0-535b6ae6b770',
    'f45becfb-890e-4a71-9b87-ec1deeb8503e',
    'fc782fb4-c739-4531-a71d-9388443c319c',
    '3096d7dd-458e-4840-a7f9-715aedca92cd',
    'd72f1822-4f99-441d-961b-2a5cf6635f42',
    'cae2218b-5eab-47e6-bcde-226f212d2bb2',
    '7b8b09f2-80cf-49c8-bd74-42b00e850cde',
    'ce525190-7d7f-4e57-8176-398cd9b0b7c5',
    '5abde6df-37b9-4086-8dc2-e29fcacf330e',
    '9eb9cc94-bc62-4a3e-b25f-e8ecbbdb7d36',
    'd5c5d577-364b-45e4-a5f5-989223897216',
    '09c4a83d-9d0b-4cf7-b04a-0747e656e019',
    '15b35c76-49ec-4225-b91e-0ff0b43e8136',
    '3ca70728-bffd-4cd0-9bf2-7a479532c9e9',
    '1bb3d012-1637-4f61-ba1e-a8549a43973e',
    'c8745002-326a-4e17-84d9-3045cfcea085',
    'fd068962-fa12-435c-87b1-8baca1788839',
    '8996212e-d0e0-4305-8638-587cfb61bf8d',
    '93d56775-9557-4e88-bb15-007bc86181ae',
    '8bb8ac5f-cb35-4253-9516-d80bd581dcb8',
    '833ee2f2-9333-401b-8f53-fa4353fef66a',
]

# Lookup of snapshot ID -> OBO Disease Ontology (DOID) term URL; it is handed
# to fetch_dataset_details() for every snapshot processed below.
# Snapshot IDs that share a term are grouped with dict.fromkeys(); the groups
# are unpacked in sequence so the resulting key order is kept stable.
ds_consent_map = {
    **dict.fromkeys(
        (
            '4b5e77cb-b79c-4a6e-aa67-ab23afa9d141',
            'e224f92f-0b40-420b-8e7a-dbc268107bc2',
            '9251ff73-9932-4ead-ab96-91feb6c56935',
            '1a256e91-a1b4-4db8-91cc-3be204872b26',
            'ea4e8c79-b6bd-4b24-990e-624de9d15835',
            'f45becfb-890e-4a71-9b87-ec1deeb8503e',
            'fc782fb4-c739-4531-a71d-9388443c319c',
            '3096d7dd-458e-4840-a7f9-715aedca92cd',
            'd72f1822-4f99-441d-961b-2a5cf6635f42',
            '7b8b09f2-80cf-49c8-bd74-42b00e850cde',
            '743984f2-9ab8-4370-9449-1159ed255c15',
            '09c4a83d-9d0b-4cf7-b04a-0747e656e019',
            '1254ef3c-3f58-41c9-a52e-eafef4492c13',
            'f3ed7376-db20-4ee8-80e0-535b6ae6b770',
        ),
        'http://purl.obolibrary.org/obo/DOID_0050589',
    ),
    **dict.fromkeys(
        (
            '2c6de04e-104d-42c8-8448-97d74985dacb',
            '2a1882d9-88ca-4849-bcc1-f6914f593407',
            'bf2f4106-cee9-419c-b4d1-d7b03a6293d5',
            'a6c36f5e-b86c-4164-85ae-8bf0df2e4a90',
            '7c19d852-e36a-4353-afea-10e501601d9a',
            '00297802-e20a-413f-b389-a6f764b6600e',
            '18e8516c-5d3e-4b5e-916d-3c71b293a9d5',
            '3e85b06a-a6ea-4ce8-a655-44b1fce12138',
            'f0d8bb27-1695-4faf-8b27-4b95260b8f17',
            '7c4edc65-bfe6-4ede-a68a-c0b9d2564f29',
            '8ee8c22d-8bbb-48e5-abe2-c2c891a1ad48',
            '8aaef8d0-ca18-4790-aa1e-09b7417a4a6a',
            '36690013-e8bc-43a5-9ba9-83317537557c',
            'fbafdd31-21a0-44c5-ae4d-724839beff61',
            '3838993f-59ba-4dec-8110-ac3ea387ab91',
            'fd3843fe-ee5d-4784-b0d2-6673f9886d30',
            '84703c54-a9dd-400c-9701-2fc40922e3e3',
            '148bba37-06ab-40b7-a0d4-a04fc515465d',
            '434f85e2-4435-483c-8099-b03c8ba794ed',
            '5bba97dc-d6ab-4329-912f-148c8b807056',
            '61b6ae23-ca19-4d31-bad3-2281a8528886',
            'f330517e-46fd-4de3-8063-015b524a7324',
            '40ebc4a1-94ea-4b5b-adeb-89b171f2a957',
            '87d02347-d169-4ce0-9027-3c8e11e48c40',
        ),
        'http://purl.obolibrary.org/obo/DOID_0060041',
    ),
    '079eb53c-e2b6-4da6-ab5f-fc2136a3ecc1': 'http://purl.obolibrary.org/obo/DOID_1936',
    'a3b18d45-96c2-4526-8fde-65ab3265868f': 'http://purl.obolibrary.org/obo/DOID_1287',
    **dict.fromkeys(
        (
            'a632ce72-5246-48fa-a140-f97ee6e9d9b2',
            'ee427556-94a5-44d9-84d6-322ed4419ac2',
            '56078c29-a393-4c60-9e04-3674e02fe729',
            '4e03ae15-3680-4690-95cf-336a86ddd7a5',
            '6e293720-2935-467c-b5fc-0f257eb1fb68',
            'b84e9146-b4d8-4685-b9b1-541b2da269bc',
        ),
        'http://purl.obolibrary.org/obo/DOID_1826',
    ),
    # Second batch of snapshots mapped to DOID_0050589; kept separate from the
    # first batch so the dict's insertion order does not change.
    **dict.fromkeys(
        (
            '794ab48a-2707-4713-937f-492b01af56d3',
            '94abf97d-45c1-441e-8d6c-355e9557b9e6',
            'c8745002-326a-4e17-84d9-3045cfcea085',
            '833ee2f2-9333-401b-8f53-fa4353fef66a',
            '15b35c76-49ec-4225-b91e-0ff0b43e8136',
        ),
        'http://purl.obolibrary.org/obo/DOID_0050589',
    ),
}

#############################################
## Execution
#############################################
# Call fetch_dataset_details() once per snapshot and collect the returned
# records as the rows of a DataFrame.
dataset_details_records = []
# snapshot_id_list contains repeated IDs (e.g. '40c4297e-d492-4f6a-b651-ee9ee38db14b'
# is listed three times). dict.fromkeys() drops the repeats while preserving the
# original order, so each snapshot is fetched once and yields exactly one row.
for snapshot_id in dict.fromkeys(snapshot_id_list):
    print(f"Processing snapshot_id: {snapshot_id}...")
    dataset_details = fetch_dataset_details(snapshot_id, ds_consent_map)
    dataset_details_records.append(dataset_details)
output = pd.DataFrame(dataset_details_records)
# Order rows by study name, then by consent group name, and renumber the index
# so the sorted frame has a clean 0..n-1 index.
output_sorted = output.sort_values(by=["studyName", "consentGroups.consentGroupName"], ascending=[True, True], ignore_index=True)

#############################################
## Validation and Output
#############################################
# ---- Unique-value validation ----
# Validate on a copy so output_sorted itself is not modified
output_unique_val = output_sorted.copy()

# Convert the list-valued study fields to strings (via try_join)
list_fields = ["dataTypes", "dataCustodianEmail", "nihICsSupportingStudy", "collaboratingSites", "alternativeDataSharingPlanReasons"]
for field in list_fields:
    output_unique_val[field] = list(map(try_join, output_unique_val[field]))

# Study-level columns = everything that is neither a consent-group column nor
# one of the identifying / per-snapshot columns listed below
study_level_col_list = []
for col in output_unique_val.columns:
    if "consentGroups." in col:
        continue
    if col in ["studyName", "snapshot_id", "consortium", "consentCode"]:
        continue
    study_level_col_list.append(col)

# Count distinct values per study for each study-level column; a study passes
# only when its largest per-column count is at most 1
df_unique = output_unique_val.groupby("studyName")[study_level_col_list].nunique()
df_unique["unique_value_validation"] = [
    "Pass" if distinct_count <= 1 else "Fail"
    for distinct_count in df_unique.max(axis=1)
]

# ---- Enum validation ----
# Separate copy: the enum columns below are overwritten with validator results
output_enum_val = output_sorted.copy()

# Column -> validator applied to each of its values.
# NOTE(review): the "< 1 => Pass" checks below presume each validator returns 0
# for an acceptable value and a positive number otherwise — confirm against the
# val_* helpers.
enum_validators = {
    "studyType": val_study_type_enum,
    "nihInstitutionCenterSubmission": val_nih_inst_center_sub_enum,
    "nihICsSupportingStudy": val_nih_ic_supp_study_enum,
    "consentGroups.fileTypes.fileType": val_file_type_enum,
}
for enum_col, validator in enum_validators.items():
    output_enum_val[enum_col] = list(map(validator, output_enum_val[enum_col]))

# Study-level enum columns: sum validator results per study, flag on the row-wise max
study_enum_cols = ["studyType", "nihInstitutionCenterSubmission", "nihICsSupportingStudy"]
df_study_enum = output_enum_val.groupby("studyName")[study_enum_cols].sum()
df_study_enum["study_enum_value_validation"] = [
    "Pass" if worst_total < 1 else "Fail"
    for worst_total in df_study_enum.max(axis=1)
]

# Consent-group-level enum columns: same scheme, grouped by consent group name
consent_group_enum_cols = ["consentGroups.fileTypes.fileType"]
df_consent_group_enum = output_enum_val.groupby("consentGroups.consentGroupName")[consent_group_enum_cols].sum()
df_consent_group_enum["consent_group_enum_value_validation"] = [
    "Pass" if worst_total < 1 else "Fail"
    for worst_total in df_consent_group_enum.max(axis=1)
]

# Attach the three Pass/Fail columns to the sorted output (left joins keep every row)
output_sorted_validated = (
    output_sorted
    .join(df_unique["unique_value_validation"], on="studyName", how="left")
    .join(df_study_enum["study_enum_value_validation"], on="studyName", how="left")
    .join(df_consent_group_enum["consent_group_enum_value_validation"], on="consentGroups.consentGroupName", how="left")
)

# Display outputs: the validated metadata table first, then one table per
# validation result set.
print("----------------------------------------------------------------------------------------------------")
print("----------------------------------------------------------------------------------------------------")
print("Validated Metadata Output:")
display(output_sorted_validated.style.hide(axis="index"))

# Each validation frame is indexed by its group key; reset_index(inplace=True)
# turns that key back into a regular column (mutating the frame) so it stays
# visible once the index is hidden in the rendered table.
for section_title, results_df in (
    ("Unique Study Value Validation Results:", df_unique),
    ("Study Enum Value Validation Results:", df_study_enum),
    ("Consent Group Enum Value Validation Results:", df_consent_group_enum),
):
    print("\n")
    print(section_title)
    results_df.reset_index(inplace=True)
    display(results_df.style.hide(axis="index"))


Processing snapshot_id: 5184edeb-81f8-406b-926a-64604090904e...
	Snapshot PHS_ID: 
	Source Workspace: 1000G-high-coverage-2019
Processing snapshot_id: d091a2a6-53e7-4721-82b8-09ccef9b13cc...
	Snapshot PHS_ID: phs001272
	Source Workspace: ANVIL_CMG_BROAD_BRAIN_ENGLE_WES
Processing snapshot_id: de2da97c-3a14-4a6d-b50b-5dc8e1af2803...
	Snapshot PHS_ID: phs001272
	Source Workspace: ANVIL_CMG_BROAD_BRAIN_SHERR_WGS
Processing snapshot_id: 2c441f75-dc1d-4674-9118-a93c5141b748...
	Snapshot PHS_ID: phs001272
	Source Workspace: ANVIL_CMG_BROAD_ORPHAN_SCOTT_WGS
Processing snapshot_id: d0709a13-9701-437d-848f-fbce26b3bf5b...
	Snapshot PHS_ID: 
	Source Workspace: ANVIL_CMG_Broad_Muscle_Laing_WES
Processing snapshot_id: 7e748fdb-7dd9-418f-957c-7a68f07aaa8d...
	Snapshot PHS_ID: phs001272
	Source Workspace: ANVIL_CMG_Broad_Orphan_Jueppner_WES
Processing snapshot_id: 824afdf1-50d9-462f-9f09-db5a1f646bd8...
	Snapshot PHS_ID: phs000693
	Source Workspace: ANVIL_CMG_UWASH_DS-BAV-IRB-PUB-RD
Processing snaps

Processing snapshot_id: 531db83e-3f7f-4732-81f0-013addbf2a8e...
	Snapshot PHS_ID: phs001546
	Source Workspace: anvil_ccdg_broad_cvd_af_natale_tcai_wes
Processing snapshot_id: f1c03eab-24bc-4b3a-8aa9-d6696dfaaf31...
	Snapshot PHS_ID: 
	Source Workspace: anvil_ccdg_broad_cvd_af_olesen_arrays
Processing snapshot_id: f875fdc4-f57d-4a4c-9b22-daf101156d26...
	Snapshot PHS_ID: 
	Source Workspace: anvil_ccdg_broad_cvd_af_olesen_wes
Processing snapshot_id: 27068295-b3c0-4260-9447-9ca96814d46f...
	Snapshot PHS_ID: phs002243
	Source Workspace: anvil_ccdg_broad_cvd_af_pegasus_hmb
Processing snapshot_id: 2e6dce09-c48f-4aa4-8d76-a4c8bb53b4c7...
	Snapshot PHS_ID: phs001725
	Source Workspace: AnVIL_CCDG_Broad_CVD_AF_Rienstra_WES
Processing snapshot_id: e588585c-4e81-422c-9058-746203958824...
	Snapshot PHS_ID: 
	Source Workspace: anvil_ccdg_broad_cvd_af_roberts_uwo_wes
Processing snapshot_id: ff2e3cd4-44c4-4068-9e57-3023a3e533c7...
	Snapshot PHS_ID: phs002242
	Source Workspace: AnVIL_CCDG_Broad_CVD_AF_

Processing snapshot_id: 44373227-8b15-4524-9ecd-57592c52a6f5...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_BELULB_DS-EP-NPU_GSA-MD
Processing snapshot_id: 6e293720-2935-467c-b5fc-0f257eb1fb68...
	Snapshot PHS_ID: phs001489
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_BELULB_DS-EP-NPU_WES
Processing snapshot_id: e3823ffe-3070-47b2-a0fc-7c0138e6c61a...
	Snapshot PHS_ID: phs001489
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_BRAUSP_DS_MDS_NPU_GSA-MD
Processing snapshot_id: ab1d91ba-6aa6-4a40-8c10-2a979cfb29ab...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_CANCAL_GSA-MD
Processing snapshot_id: 1bb35cec-4174-445b-a646-ff707abc2fd9...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_CANUTN_DS-EP_GSA-MD
Processing snapshot_id: a632ce72-5246-48fa-a140-f97ee6e9d9b2...
	Snapshot PHS_ID: phs001489
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_CANUTN_DS-EP_WES
Processing snapshot_id: f87a7821-288b-4bc3-93a2-94ae34604540...
	S

	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_ITAUBG_DS-EPI-NPU-MDS_GSA-MD
Processing snapshot_id: b84e9146-b4d8-4685-b9b1-541b2da269bc...
	Snapshot PHS_ID: phs001489
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_ITAUBG_DS_EPI_NPU_MDS_WES
Processing snapshot_id: f61d6193-468f-45ed-bf0a-75a5662871bd...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMC_DS_NEURO_MDS_GSA-MD
Processing snapshot_id: be47d532-a9d8-4a86-bf58-00b2920dd320...
	Snapshot PHS_ID: phs001489
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMC_DS_NEURO_MDS_WES
Processing snapshot_id: 5630c567-3752-4fa2-8124-c1b8bca37aca...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMR_GRU-NPU_GSA-MD
Processing snapshot_id: 52f35032-2afa-4722-983e-e88c8cb808ac...
	Snapshot PHS_ID: phs001489
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMR_GRU_NPU_WES
Processing snapshot_id: 51865c0a-9548-4fea-a6e5-c8754a0bb085...
	Snapshot PHS_ID: 
	Source Workspace: An

Processing snapshot_id: 636272e4-d4e2-4a25-ba10-e1d1cb9352bb...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_USAMGH_MGBB_HMB_MDS_WES
Processing snapshot_id: 0acc57c8-5c8e-44ab-bab5-6dcb7b6465ef...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSA-MD
Processing snapshot_id: 63a987d4-bfca-46c4-bbbc-afdbf357308a...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_GSA-MD
Processing snapshot_id: 538662ce-44d2-4fb9-ab3d-f02342d26761...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_WES
Processing snapshot_id: f58f9cc2-70e0-40fd-8adc-674adc503f8f...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_NPU_GSA-MD
Processing snapshot_id: ffc3e5ac-95e6-464d-92dc-1c4fd1ca394e...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_NPU_WES
Processing snapshot_id: 7406b139-dae3-499e-94f5-b762fee73bd7...
	Snapshot PHS_ID: 
	Sour

	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_WASHU_PAGE
Processing snapshot_id: 9cf61d88-d096-4981-b0c6-99db77554c01...
	Snapshot PHS_ID: phs001222
	Source Workspace: anvil_ccdg_washu_ai_t1d_t1dgc_wgs
Processing snapshot_id: 4c722626-c559-4f5a-84bd-8d7d46983e1e...
	Snapshot PHS_ID: phs002163
	Source Workspace: anvil_ccdg_washu_cvd-np-ai_controls_vccontrols_wgs
Processing snapshot_id: 7c237e08-3329-4e64-bd2a-063be290e78b...
	Snapshot PHS_ID: 
	Source Workspace: anvil_ccdg_washu_cvd_brazil-cvd_wgs
Processing snapshot_id: 4117144f-92e7-454f-9263-dad5e128cadb...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CCDG_WashU_CVD_Corogene_WGS
Processing snapshot_id: ce2e7235-26e6-470f-8e05-298193b7f53d...
	Snapshot PHS_ID: phs002325
	Source Workspace: AnVIL_CCDG_WashU_CVD_EOCAD_BioImage_WGS
Processing snapshot_id: 6df525e1-b143-4e6f-b667-80c783ae1b66...
	Snapshot PHS_ID: phs001155
	Source Workspace: anvil_ccdg_washu_cvd_eocad_biome_wgs
Processing snapshot_id: 92666b7c-4d50-4530-88e9-ea2d3da9d07a

Processing snapshot_id: f06adf86-4526-47a8-b59a-2bf137e034d2...
	Snapshot PHS_ID: phs001272
	Source Workspace: AnVIL_CMG_Broad_Orphan_Chung_WES
Processing snapshot_id: bbd04481-0b9d-4c21-ba65-a43638116e0f...
	Snapshot PHS_ID: phs001272
	Source Workspace: AnVIL_CMG_Broad_Orphan_Estonia-Ounap_WES
Processing snapshot_id: dbdfebae-3eb0-4fc5-b744-eb901da3591c...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_CMG_Broad_Orphan_Estonia-Ounap_WGS
Processing snapshot_id: 29fa069b-8df8-4fb3-bfa8-01e0504d050c...
	Snapshot PHS_ID: phs001272
	Source Workspace: AnVIL_CMG_Broad_Orphan_Lerner-Ellis_WES
Processing snapshot_id: 2b78a3ac-8bca-4938-bc7c-26a60f9c04ac...
	Snapshot PHS_ID: phs001272
	Source Workspace: AnVIL_CMG_Broad_Orphan_Manton_WES
Processing snapshot_id: 4bb891fc-fcae-40cc-bf59-73716de7e04e...
	Snapshot PHS_ID: phs001272
	Source Workspace: AnVIL_CMG_Broad_Orphan_Manton_WGS
Processing snapshot_id: 508b9f8a-c827-4dc0-8319-6aeb90482bdf...
	Snapshot PHS_ID: phs001272
	Source Workspace: AnVIL_CMG

	Snapshot PHS_ID: phs000298
	Source Workspace: AnVIL_ccdg_asc_ndd_daly_talkowski_goethe_asd_exome
Processing snapshot_id: 3838993f-59ba-4dec-8110-ac3ea387ab91...
	Snapshot PHS_ID: 
	Source Workspace: AnVIL_ccdg_asc_ndd_daly_talkowski_herman_asd_exome
Processing snapshot_id: bf2f4106-cee9-419c-b4d1-d7b03a6293d5...
	Snapshot PHS_ID: phs000298
	Source Workspace: AnVIL_ccdg_asc_ndd_daly_talkowski_hertz-picciotto_asd_exome
Processing snapshot_id: a6c36f5e-b86c-4164-85ae-8bf0df2e4a90...
	Snapshot PHS_ID: phs000298
	Source Workspace: AnVIL_ccdg_asc_ndd_daly_talkowski_hertz-picciotto_asd_wgs
Processing snapshot_id: 5e547934-c339-410e-a013-dfefed50f4b8...
	Snapshot PHS_ID: phs000298
	Source Workspace: AnVIL_ccdg_asc_ndd_daly_talkowski_kolevzon_asd_exome
Processing snapshot_id: ffa84feb-ca0e-43d3-a04d-a402a8e24a3b...
	Snapshot PHS_ID: phs000298
	Source Workspace: AnVIL_ccdg_asc_ndd_daly_talkowski_kolevzon_asd_wgs
Processing snapshot_id: 2be072bd-2153-4050-9358-e4b95297a9bf...
	Snapshot PHS_ID: 


	Snapshot PHS_ID: 
	Source Workspace: anvil_ccdg_broad_ai_ibd_daly_mccauley_wes
Processing snapshot_id: c890024c-40ed-42db-ae45-b119d038461e...
	Snapshot PHS_ID: phs001642
	Source Workspace: anvil_ccdg_broad_ai_ibd_daly_mcgovern_gsa
Processing snapshot_id: cc033b0a-6285-426b-8d6c-f29739b62920...
	Snapshot PHS_ID: phs001642
	Source Workspace: anvil_ccdg_broad_ai_ibd_daly_mcgovern_niddk_wes
Processing snapshot_id: 4022a967-0753-4f74-a682-b980528c112d...
	Snapshot PHS_ID: phs001642
	Source Workspace: anvil_ccdg_broad_ai_ibd_daly_mcgovern_share_wes
Processing snapshot_id: 68af6886-c7de-4a2f-abde-0314a301ac1a...
	Snapshot PHS_ID: 
	Source Workspace: anvil_ccdg_broad_ai_ibd_daly_moayyedi_imagine_gsa
Processing snapshot_id: ba1a01e5-23e3-417a-a45d-91368dce617c...
	Snapshot PHS_ID: phs001642
	Source Workspace: anvil_ccdg_broad_ai_ibd_daly_moayyedi_imagine_wes
Processing snapshot_id: 46bb697c-4b2c-4ae2-90d8-4fed2a00f831...
	Snapshot PHS_ID: phs001642
	Source Workspace: anvil_ccdg_broad_ai_ibd_d

snapshot_id,studyName,studyType,studyDescription,dataTypes,phenotypeIndication,species,piName,dataCustodianEmail,publicVisibility,nihAnvilUse,submittingToAnvil,dbGaPPhsID,dbGaPStudyRegistrationName,embargoReleaseDate,sequencingCenter,piEmail,piInstitution,nihGrantContractNumber,nihICsSupportingStudy,nihProgramOfficerName,nihInstitutionCenterSubmission,nihInstitutionalCertificationFileName,nihGenomicProgramAdministratorName,multiCenterStudy,collaboratingSites,controlledAccessRequiredForGenomicSummaryResultsGSR,controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation,alternativeDataSharingPlan,alternativeDataSharingPlanReasons,alternativeDataSharingPlanExplanation,alternativeDataSharingPlanFileName,alternativeDataSharingPlanDataSubmitted,alternativeDataSharingPlanDataReleased,alternativeDataSharingPlanTargetDeliveryDate,alternativeDataSharingPlanTargetPublicReleaseDate,alternativeDataSharingPlanAccessManagement,consentGroups.consentGroupName,consentGroups.accessManagement,consentGroups.numberOfParticipants,consentCode,consentGroups.generalResearchUse,consentGroups.hmb,consentGroups.diseaseSpecificUse,consentGroups.gs,consentGroups.poa,consentGroups.nmds,consentGroups.gso,consentGroups.pub,consentGroups.col,consentGroups.irb,consentGroups.npu,consentGroups.otherPrimary,consentGroups.otherSecondary,consentGroups.mor,consentGroups.morDate,consentGroups.dataLocation,consentGroups.url,consentGroups.fileTypes.fileType,consentGroups.fileTypes.functionalEquivalence,consortium,unique_value_validation,study_enum_value_validation,consent_group_enum_value_validation
5184edeb-81f8-406b-926a-64604090904e,1000Genomes,Parent-Offspring Trios,"Data Release notes for the workspace 1000 Genomes 3202 phase 3, Update May 23, 2023 From March 8, 2023 to May 8, 2023, the following files below were temporarily unavailable. As of May 8, 2023, these files have been restored. HG00257.final.cram HG00369.final.cram HG00657.final.cram HG00684.final.cram HG01079.final.cram HG01110.final.cram HG01122.final.cram HG01491.final.cram HG01697.final.cram HG01777.final.cram HG01976.final.cram HG02164.final.cram HG02312.final.cram HG02380.final.cram HG02512.final.cram HG02557.final.cram HG02681.final.cram HG02769.final.cram HG03063.final.cram HG03563.final.cram HG03844.final.cram HG03898.final.cram HG03937.final.cram HG04141.final.cram NA18912.final.cram NA19060.final.cram NA19238.final.cram NA19360.final.cram NA19474.final.cram NA20790.final.cram NA21126.final.cram HG00408.final.cram HG00639.final.cram HG01096.final.cram HG01529.final.cram HG02821.final.cram HG03047.final.cram HG03487.final.cram NA18487.final.cram NA18500.final.cram NA20279.final.cram HG01141.haplotypeCalls.er.raw.vcf.gz NA10830.haplotypeCalls.er.raw.vcf.gz NA18863.haplotypeCalls.er.raw.vcf.gz 1000 Genomes 3202 phase 3 panel samples sequenced to high coverage ================================================================== This policy refers to 30x Illumina NovaSeq sequencing of 3202 samples from the 1000 Genomes project phase 3 sample set, along with 698 family members that complete trios in the phase 3 data . These data were generated at the New York Genome Center with funds provided by NHGRI Grant 3UM1HG008901. Please email service@nygenome.org with questions or interest in undertaking collaborative analysis of this dataset. All cell lines were obtained from the Coriell Institute for Medical Research and were consented for full public release of genomic data. Please see Coriell (https://www.coriell.org) for more information about specific cell lines. 
The following cell lines/DNA samples were obtained from the NIGMS Human Genetic Cell Repository at the Coriell Institute for Medical Research: [NA06984, NA06985, NA06986, NA06989, NA06994, NA07000, NA07037, NA07048, NA07051, NA07056, NA07347, NA07357, NA10847, NA10851, NA11829, NA11830, NA11831, NA11832, NA11840, NA11843, NA11881, NA11892, NA11893, NA11894, NA11918, NA11919, NA11920, NA11930. NA11931, NA11932, NA11933, NA11992, NA11994, NA11995, NA12003, NA12004, NA12005, NA12006, NA12043, NA12044, NA12045, NA12046, NA12058, NA12144, NA12154, NA12155, NA12156, NA12234, NA12249, NA12272, NA12273, NA12275, NA12282, NA12283, NA12286, NA12287, NA12340, NA12341, NA12342, NA12347, NA12348, NA12383, NA12399, NA12400, NA12413,, NA12414, NA12489, NA12546, NA12716, NA12717, NA12718, NA12748, NA12749, NA12750, NA12751, NA12760, NA12761, NA12762, NA12763, NA12775, NA12776, NA12777, NA12778, NA12812, NA12813, NA12814, NA12815, NA12827, NA12828, NA12829, NA12830, NA12842, NA12843, NA12872, NA12873, NA12874, NA12878, NA12889, NA12890, NA12376, NA10838, NA12329, NA10852, NA10840, NA12386, NA12864, NA12801, NA12344, NA10861, NA07029, NA12753, NA12832, NA12485, NA12802, NA12739, NA10856, NA10845, NA12818, NA10831, NA12766, NA10864, NA10843, NA12877, NA12335, NA12817, NA12752, NA12767, NA10855, NA12707, NA10857, NA10839, NA12740, NA10837, NA10836, NA07348, NA11993, NA12057, NA11839, NA06993, NA07014, NA06995, NA12146, NA12865, NA10859, NA06991, NA12336, NA10860, NA12145, NA07045, NA07349, NA07031, NA07345, NA12891, NA07055, NA07435, NA10835, NA12274, NA12875, NA10842, NA12239, NA10830, NA12056, NA11917, NA12892, NA06997, NA07022, NA12264, NA11891, NA07034, NA12248, NA10865, , NA10863, NA10854, NA11882, NA07346, NA07019, NA12343, NA10846]. 
If using this data please acknowledge that: â€œThese data were generated at the New York Genome Center with funds provided by NHGRI Grant 3UM1HG008901-03S1.â€ Additionally, if data from any of the cell lines listed were used, then they should be cited specifically as coming from NIGMS. For more general information about the consent for the samples in this data set, please see http://www.internationalgenome.org/about#g1k_data_reuse. For general enquiries, please contact info@1000genomes.org. ================================================================== 1000 Genomes Processing README This README contains information relating to data associated with the 1000 Genomes resequencing done at New York Genome Center. Alignment, post-processing and variant calling Alignment and post-processing are performed exactly as outlined by the Center for Common Disease Genomics project: https://github.com/CCDG/PipelineStandardization/blob/master/PipelineStandard.md . Programs and reference data The data was aligned to the reference genome using the following programs and reference datasets: 1. BWA-MEM bwakit-0.7.15 2. Samtools-1.3.1 3. Picard-2.4.1 4. GATK-3.5-0 5. Resource files – All the resource files used in the analysis can be obtained here: https://console.cloud.google.com/storage/browser/genomics-publicdata/resources/broad/hg38/v0/ . Reference genome: GRCh38 with alternative sequences, plus decoys and HLA The reference genome that the data was aligned to can be obtained here: ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome /GRCh38_full_analysis_set_plus_decoy_hla.fa ================================================================== Analysis work is being done by a number of groups, working toward variant calling, including identification of structural variation. Initial analysis has been done by NYGC, including aligning the data to GRCh38, creating the CRAMs in ENA. 
The document [NYGC_b38_pipeline_description.pdf](http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_cov/NYGC_b38_pipeline_description.pdf) contains a description of that analysis work and details of the alignment pipeline. Should you have questions about this data please contact info@1000genomes.org Platform: AnVIL","['Raw Sequencing data', 'VCF']",,Human,AnVIL Team,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['1000G'],,,False,[],,,,,,,,ANVIL_1000G_high_coverage_2019_20230517_ANV5_202305181946,open,3202,NRES,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/5184edeb-81f8-406b-926a-64604090904e,['Whole Genome'],,1000G,Pass,Fail,Fail
dd2b61fb-d420-4a38-9cd2-8464f51d7617,A New Reference Panel to Boost African American Genotype Imputation (phs001798),Cross-Sectional,"Modern genetic studies have been conducted predominantly in cohorts of individuals of European ancestry. By 2010, there were approximately ten times as many published genome wide association studies (GWAS) in people of European ancestry than studies in people of all other ancestries combined. This research disparity has led to an uneven understanding of the genetic basis underlying disease in Europeans and non-Europeans. 23andMe's web-based, large scale research model is ideal for scaling genetics research within non-European populations and thereby bringing more parity to genetics research. Our database is composed of genotypes and phenotypes of over 1,000,000 consenting customers, including over 200,000 individuals with non-European ancestry. The data derived from non-European individuals represent a particularly valuable resource for genetic discovery of novel variants that may not be found in the European population. However, research studies in non-European populations are weakened by the lack of availability of large-scale reference datasets and, in particular, genotype imputation panels. Genotype imputation is a statistical methodology that uses observations of genotypes in a large reference panel to infer unobserved genotypes in a target dataset. This methodology is widely used within GWAS, and allows novel genetic associations to be identified and refined. Due to this utility, very large reference panels have been constructed, containing thousands or tens of thousands of whole genome sequences. Unfortunately, the largest imputation panels are composed of predominantly European genomes, reflecting the modern bias towards European studies in GWAS. This proposal aims to address this imbalance by constructing an imputation panel specifically for the African American population. 
In doing so, we will expand 23andMe's ability to perform genetic discovery in non-European populations, and improve the understanding of global genetic variation underlying diseases and traits. Key commercial outcomes of the research include the identification of novel genetic targets for internal and external therapeutic development. The long-term aim is to improve understanding of disease in minority populations, which we hope may eventually lead to improved treatments of disease in these historically medically understudied groups. Platform: AnVIL","['SNP/CNV Genotypes (NGS)', 'WGS']",Black or African American,Human,Adam Auton,['help@lists.anvilproject.org'],True,I am NHGRI funded and I have a dbGaP PHS ID already,True,phs001798,A New Reference Panel to Boost African American Genotype Imputation,,,,"23andMe, Inc., Mountain View, CA, USA",,['NHGRI'],,NHGRI,,"Strasburger, Jennifer",,[],,,False,[],,,,,,,,ANVIL_African_American_Seq_HGV_20230727_ANV5_202308291753,controlled,1,HGV,False,False,[],,False,False,False,False,False,False,False,HGV,,,,TDR Location,https://data.terra.bio/snapshots/dd2b61fb-d420-4a38-9cd2-8464f51d7617,"['SNP/CNV Genotypes (NGS), Whole Genome']",,,Pass,Fail,Fail
5fb3cd44-691f-41ef-a009-5a401b5fcae5,AnVIL CMG,Parent-Offspring Trios,Rare muscular disease samples submitted to the Broad Center for Mendelian Genomics by the Kids Neuroscience Centre of the Sydney Children's Hospitals Network. Cohort consists of WGS samples aligned to hg38. Platform: AnVIL,['Raw Sequencing data'],Mendelian disorders,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CMG'],,,False,[],,,,,,,,ANVIL_CMG_Broad_Muscle_KNC_WGS_20221117_ANV5_202304242221,controlled,6,DS-NIC-EMP-LENF,False,False,['DS-NIC-EMP-LENF'],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/5fb3cd44-691f-41ef-a009-5a401b5fcae5,['Whole Genome'],,CMG,Pass,Fail,Fail
ba915a8d-24d0-4a94-9220-4f1d058521a1,AnVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_WES,,AnVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_WES Platform: AnVIL,['Raw Sequencing data'],atrial fibrillation,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_WES_20221117_ANV5_202304271354,controlled,489,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/ba915a8d-24d0-4a94-9220-4f1d058521a1,['Exome'],,CCDG,Pass,Pass,Pass
50a37ecf-071a-4f8f-9c72-70280973f9eb,AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPIL_BA_MDS_GSA-MD,Case-Control,Platform: AnVIL,['Genotyping Array data'],epilepsy,Human,Benjamin Neale,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPIL_BA_MDS_GSA_MD_20221117_ANV5_202304271400,controlled,12,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/50a37ecf-071a-4f8f-9c72-70280973f9eb,['Genotyping Array'],,CCDG,Pass,Fail,Fail
895f4ecd-fdda-4e85-8fee-be0721b74184,AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPI_BA_ID_MDS_GSA-MD,,Platform: AnVIL,['Genotyping Array data'],epilepsy,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPI_BA_ID_MDS_GSA_MD_20221117_ANV5_202304271358,controlled,25,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/895f4ecd-fdda-4e85-8fee-be0721b74184,['Genotyping Array'],,CCDG,Pass,Pass,Fail
e9dcabec-7cc0-482b-83a5-f596e7a98db0,AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EP_BA_CN_ID_MDS_GSA-MD,,Platform: AnVIL,['Genotyping Array data'],epilepsy,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EP_BA_CN_ID_MDS_GSA_MD_20221117_ANV5_202304271356,controlled,1520,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/e9dcabec-7cc0-482b-83a5-f596e7a98db0,['Genotyping Array'],,CCDG,Pass,Pass,Fail
5cd2e542-1090-4dfb-a7a5-b276b32e58dc,AnVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS-EAED-MDS_GSA-MD,Case-Control,Platform: AnVIL,['Genotyping Array data'],epilepsy,Human,Benjamin Neale,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS_EAED_MDS_GSA_MD_20221117_ANV5_202304271401,controlled,388,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/5cd2e542-1090-4dfb-a7a5-b276b32e58dc,['Genotyping Array'],,CCDG,Pass,Fail,Fail
3c672fd0-d723-49f4-b2c6-d24d2658a049,AnVIL_CCDG_Broad_NP_Epilepsy_BELATW_GRU_GSA-MD,,Platform: AnVIL,['Genotyping Array data'],epilepsy,Human,,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_CCDG_Broad_NP_Epilepsy_BELATW_GRU_GSA_MD_20221117_ANV5_202304271403,controlled,231,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/3c672fd0-d723-49f4-b2c6-d24d2658a049,['Genotyping Array'],,CCDG,Pass,Pass,Fail
44373227-8b15-4524-9ecd-57592c52a6f5,AnVIL_CCDG_Broad_NP_Epilepsy_BELULB_DS-EP-NPU_GSA-MD,Case-Control,Platform: AnVIL,['Genotyping Array data'],epilepsy,Human,Ben Neale,['help@lists.anvilproject.org'],True,I am NHGRI funded and I do not have a dbGaP PHS ID,True,,,,,,,,,,NHGRI,,,,['CCDG'],,,False,[],,,,,,,,ANVIL_CCDG_Broad_NP_Epilepsy_BELULB_DS_EP_NPU_GSA_MD_20230118_ANV5_202304271404,controlled,422,,False,False,[],,False,False,False,False,False,False,False,,,,,TDR Location,https://data.terra.bio/snapshots/44373227-8b15-4524-9ecd-57592c52a6f5,['Genotyping Array'],,CCDG,Pass,Fail,Fail




Unique Study Value Validation Results:


studyName,studyType,studyDescription,dataTypes,phenotypeIndication,species,piName,dataCustodianEmail,publicVisibility,nihAnvilUse,submittingToAnvil,dbGaPPhsID,dbGaPStudyRegistrationName,embargoReleaseDate,sequencingCenter,piEmail,piInstitution,nihGrantContractNumber,nihICsSupportingStudy,nihProgramOfficerName,nihInstitutionCenterSubmission,nihInstitutionalCertificationFileName,nihGenomicProgramAdministratorName,multiCenterStudy,collaboratingSites,controlledAccessRequiredForGenomicSummaryResultsGSR,controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation,alternativeDataSharingPlan,alternativeDataSharingPlanReasons,alternativeDataSharingPlanExplanation,alternativeDataSharingPlanFileName,alternativeDataSharingPlanDataSubmitted,alternativeDataSharingPlanDataReleased,alternativeDataSharingPlanTargetDeliveryDate,alternativeDataSharingPlanTargetPublicReleaseDate,alternativeDataSharingPlanAccessManagement,unique_value_validation
1000Genomes,1,1,1,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
A New Reference Panel to Boost African American Genotype Imputation (phs001798),1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL CMG,1,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_WES,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPIL_BA_MDS_GSA-MD,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPI_BA_ID_MDS_GSA-MD,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EP_BA_CN_ID_MDS_GSA-MD,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS-EAED-MDS_GSA-MD,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_BELATW_GRU_GSA-MD,0,1,1,1,1,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_BELULB_DS-EP-NPU_GSA-MD,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass




Study Enum Value Validation Results:


studyName,studyType,nihInstitutionCenterSubmission,nihICsSupportingStudy,study_enum_value_validation
1000Genomes,1,0,0,Fail
A New Reference Panel to Boost African American Genotype Imputation (phs001798),1,0,0,Fail
AnVIL CMG,1,0,0,Fail
AnVIL_CCDG_Broad_CVD_AF_Ellinor_MGH_WES,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPIL_BA_MDS_GSA-MD,1,0,0,Fail
AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPI_BA_ID_MDS_GSA-MD,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EP_BA_CN_ID_MDS_GSA-MD,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS-EAED-MDS_GSA-MD,1,0,0,Fail
AnVIL_CCDG_Broad_NP_Epilepsy_BELATW_GRU_GSA-MD,0,0,0,Pass
AnVIL_CCDG_Broad_NP_Epilepsy_BELULB_DS-EP-NPU_GSA-MD,1,0,0,Fail




Consent Group Enum Value Validation Results:


consentGroups.consentGroupName,consentGroups.fileTypes.fileType,consent_group_enum_value_validation
ANVIL_1000G_high_coverage_2019_20230517_ANV5_202305181946,1,Fail
ANVIL_African_American_Seq_HGV_20230727_ANV5_202308291753,1,Fail
ANVIL_CCDG_Baylor_CVD_AFib_BioVU_WGS_20221110_ANV5_202304241855,1,Fail
ANVIL_CCDG_Baylor_CVD_AFib_Groningen_WGS_20221122_ANV5_202304242224,1,Fail
ANVIL_CCDG_Baylor_CVD_AFib_VAFAR_HMB_IRB_WGS_20221020_ANV5_202304211525,1,Fail
ANVIL_CCDG_Baylor_CVD_ARIC_20231008_ANV5_202312122036,1,Fail
ANVIL_CCDG_Baylor_CVD_EOCAD_BioMe_WGS_20221122_ANV5_202304242226,1,Fail
ANVIL_CCDG_Baylor_CVD_EOCAD_SoL_WGS_20230418_ANV5_202312122046,1,Fail
ANVIL_CCDG_Baylor_CVD_HHRC_Brownsville_GRU_WGS_20221122_ANV5_202304242228,1,Fail
ANVIL_CCDG_Baylor_CVD_HemStroke_BNI_HMB_WGS_20221215_ANV5_202304242306,1,Fail


# Step 2: Load Reviewed Metadata into DUOS

In [110]:
#############################################
## Functions
#############################################

def format_list(input_list, min_items):
    """Coerce a value into a list, padding empty input with placeholders.

    Args:
        input_list: The value to normalise. May be a list, a string holding
            a Python literal (e.g. "['a', 'b']"), or anything else.
        min_items: Number of "Unknown" placeholders to return when
            ``input_list`` is empty/falsy.

    Returns:
        * the same list object when ``input_list`` is a non-empty list;
        * the result of re-processing ``ast.literal_eval(input_list)`` when
          ``input_list`` is a non-empty string;
        * an empty list for any other truthy value;
        * a list of ``min_items`` "Unknown" strings (empty when
          ``min_items`` is not positive) when ``input_list`` is falsy.

    Any exception raised by ``ast.literal_eval`` for a string that is not a
    valid literal is propagated to the caller.
    """
    # Falsy input (None, "", [], ...): build the placeholder list.
    if not input_list:
        placeholders = []
        while len(placeholders) < min_items:
            placeholders.append("Unknown")
        return placeholders

    # Already a list: hand it back untouched.
    if isinstance(input_list, list):
        return input_list

    # A string is expected to hold a serialised literal; parse it and run the
    # parsed value back through the same rules.
    if isinstance(input_list, str):
        return format_list(ast.literal_eval(input_list), min_items)

    # Any other truthy type is not representable as a list here.
    return []
    
def format_file_types(ft_list, fe):
    """Build one file-type record per entry in ``ft_list``.

    Args:
        ft_list: File types, either as a list or in any form accepted by
            ``format_list`` (it is passed through ``format_list(ft_list, 0)``).
        fe: Functional-equivalence value applied to every record; a falsy
            value is replaced with "Unknown".

    Returns:
        A list of dicts with the keys "fileType" and "functionalEquivalence",
        or an empty list when ``ft_list`` is falsy.
    """
    # Nothing supplied: no records to build.
    if not ft_list:
        return []

    # The same equivalence value is attached to every file type.
    equivalence = fe if fe else "Unknown"
    return [
        {"fileType": file_type, "functionalEquivalence": equivalence}
        for file_type in format_list(ft_list, 0)
    ]
    
def upload_to_duos(input_file, token):
    
    # Pull down specified file from the cloud
    results_log = []
    print(f"Downloading input file {input_file}...")
    try:
        input_df = pd.read_csv(input_file_gcs_path, delimiter = "\t", encoding='unicode_escape')
        input_df = input_df.astype(object).where(pd.notnull(input_df),None)
        input_df.fillna("",inplace=True)
        input_dict = input_df.to_dict(orient="records")
        results_log.append(["Input File Download", "Succeeded", ""])
    except Exception as e:
        msg = f"Error downloading input file ({input_file}): {str(e)}"
        results_log.append(["Input File Download", "Failed", msg])
        print(msg)
        return results_log

    # Pull a list of existing datasets and studies from DUOS and build lookup dicts
    print("Building study and dataset lookup dicts from DUOS...")
    try:
        datasets = requests.get(
            url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/v2?asCustodian=false",
            headers={"Authorization": f"Bearer {token}"}
        ).json()
        study_lookup = {}
        for dataset_entry in datasets:
            if dataset_entry["study"].get("name"):
                if not study_lookup.get(dataset_entry["study"]["name"]):
                    study_lookup[dataset_entry["study"]["name"]] = dataset_entry["study"]["studyId"]
        dataset_lookup = {}
        for dataset_entry in datasets:
            if dataset_entry.get("name"):
                dataset_lookup[dataset_entry["name"]] = dataset_entry["dataSetId"]
        results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Succeeded", ""])
    except Exception as e:
        msg = f"Error building study and dataset lookups: {str(e)}"
        results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Failed", msg])
        print(msg)
        return results_log
    
    # Parse and build DUOS schema for inputted file
    print("Parsing input file and formatting into DUOS schema...")
    try:
        # Determine data submitter id
        response = requests.get(
            url=f"https://consent.dsde-dev.broadinstitute.org/api/user/me",
            headers={"Authorization": f"Bearer {token}"}
        ).json()
        data_submitter_id = response["userId"]
        # Build dictionary for upload
        upload_dict = {}
        for input_entry in input_dict:
            snapshot_id = input_entry["snapshot_id"]
            study_name = input_entry["studyName"]
            consent_group_name = input_entry["consentGroups.consentGroupName"]
            access_type = input_entry["consentGroups.accessManagement"]
            print(f"Parsing and formatting metadata for snapshot {snapshot_id} from the input file. Target study is: {study_name}")
            study_id = study_lookup.get(study_name)
            dataset_id = dataset_lookup.get(consent_group_name)
            if study_id and dataset_id:
                consent_group_dict = {
                            "consentGroupName": consent_group_name,
                            "datasetId": dataset_id,
                            "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                            "dataLocation": input_entry["consentGroups.dataLocation"],
                            "url": input_entry["consentGroups.url"],
                            "fileTypes": []
                            #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                    }
            elif access_type == "open":
                consent_group_dict = {
                            "consentGroupName": consent_group_name,
                            "accessManagement": access_type,
                            "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                            "dataLocation": input_entry["consentGroups.dataLocation"],
                            "url": input_entry["consentGroups.url"],
                            "fileTypes": []
                            #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                    }
            else:
                consent_group_dict = {
                            "consentGroupName": consent_group_name,
                            "dataAccessCommitteeId": 3,
                            "accessManagement": access_type,
                            "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                            "generalResearchUse": input_entry["consentGroups.generalResearchUse"],
                            "hmb": input_entry["consentGroups.hmb"],
                            "diseaseSpecificUse": format_list(input_entry["consentGroups.diseaseSpecificUse"], 0),
                            "gs": input_entry["consentGroups.gs"],
                            "poa": input_entry["consentGroups.poa"],
                            "nmds": input_entry["consentGroups.nmds"],
                            "gso": input_entry["consentGroups.gso"],
                            "pub": input_entry["consentGroups.pub"],
                            "col": input_entry["consentGroups.col"],
                            "irb": input_entry["consentGroups.irb"],
                            "npu": input_entry["consentGroups.npu"],
                            #"otherPrimary": input_entry["consentGroups.otherPrimary"], --> Excluding for now, per JL's request
                            #"otherSecondary": input_entry["consentGroups.otherSecondary"], --> Excluding for now, per JL's request
                            #"mor": input_entry["consentGroups.mor"], --> Date formatting validation for morDate, exclude for now
                            #"morDate": input_entry["consentGroups.morDate"], --> Date formatting validation, exclude for now
                            "dataLocation": input_entry["consentGroups.dataLocation"],
                            "url": input_entry["consentGroups.url"],
                            "fileTypes": []
                            #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                    }
            study_dict = {}
            consent_group_list = []
            if study_name not in upload_dict.keys():
                consent_group_list.append(consent_group_dict)
                study_dict = {
                    "studyName": study_name,
                    #"studyType": input_entry["studyType"], --> Enumeration, exclude for now
                    "studyDescription": input_entry["studyDescription"],
                    "dataTypes": format_list(input_entry["dataTypes"], 1),
                    "phenotypeIndication": input_entry["phenotypeIndication"],
                    "species": input_entry["species"],
                    "piName": input_entry["piName"] if input_entry["piName"] else "NA",
                    "dataSubmitterUserId": data_submitter_id,
                    "dataCustodianEmail": format_list(input_entry["dataCustodianEmail"], 0),
                    "publicVisibility": input_entry["publicVisibility"],
                    "nihAnvilUse": input_entry["nihAnvilUse"],
                    "submittingToAnvil": input_entry["submittingToAnvil"],
                    "dbGaPPhsID": input_entry["dbGaPPhsID"],
                    "dbGaPStudyRegistrationName": input_entry["studyName"],
                    #"embargoReleaseDate": input_entry["embargoReleaseDate"], --> Date formatting validation, exclude for now
                    "sequencingCenter": input_entry["sequencingCenter"],
                    "piEmail": input_entry["piEmail"],
                    #"piInstitution": input_entry["piInstitution"], --> Integer ID for registered institutions, exclude for now
                    "piInstitution": 0,
                    "nihGrantContractNumber": "Unknown", # Required currently
                    "nihICsSupportingStudy": format_list(input_entry["nihICsSupportingStudy"], 0),
                    "nihProgramOfficerName": input_entry["nihProgramOfficerName"],
                    "nihInstitutionCenterSubmission": input_entry["nihInstitutionCenterSubmission"],
                    "nihInstitutionalCertificationFileName": input_entry["nihInstitutionalCertificationFileName"],
                    "nihGenomicProgramAdministratorName": input_entry["nihGenomicProgramAdministratorName"],
                    "collaboratingSites": format_list(input_entry["collaboratingSites"], 0),
                    "alternativeDataSharingPlan": input_entry["alternativeDataSharingPlan"],
                    "consentGroups": consent_group_list
                }
                upload_dict[study_name] = study_dict
            else:
                for consent_group in upload_dict[study_name]["consentGroups"]:
                    if consent_group["consentGroupName"] != consent_group_dict["consentGroupName"]:
                        consent_group_list.append(consent_group)
                consent_group_list.append(consent_group_dict)
                study_dict = {
                    "studyName": study_name,
                    #"studyType": upload_dict[study_name]["studyType"], --> Enumeration, exclude for now
                    "studyDescription": upload_dict[study_name]["studyDescription"],
                    "dataTypes": upload_dict[study_name]["dataTypes"],
                    "phenotypeIndication": upload_dict[study_name]["phenotypeIndication"],
                    "species": upload_dict[study_name]["species"],
                    "piName": upload_dict[study_name]["piName"] if upload_dict[study_name]["piName"] else "NA",
                    "dataSubmitterUserId": upload_dict[study_name]["dataSubmitterUserId"],
                    "dataCustodianEmail": upload_dict[study_name]["dataCustodianEmail"],
                    "publicVisibility": upload_dict[study_name]["publicVisibility"],
                    "nihAnvilUse": upload_dict[study_name]["nihAnvilUse"],
                    "submittingToAnvil": upload_dict[study_name]["submittingToAnvil"],
                    "dbGaPPhsID": upload_dict[study_name]["dbGaPPhsID"],
                    "dbGaPStudyRegistrationName": upload_dict[study_name]["studyName"],
                    #"embargoReleaseDate": upload_dict[study_name]["embargoReleaseDate"], --> Date formatting validation, exclude for now
                    "sequencingCenter": upload_dict[study_name]["sequencingCenter"],
                    "piEmail": upload_dict[study_name]["piEmail"],
                    #"piInstitution": upload_dict[study_name]["piInstitution"], --> Integer ID for registered institutions, exclude for now
                    "piInstitution": upload_dict[study_name]["piInstitution"],
                    "nihGrantContractNumber": upload_dict[study_name]["nihGrantContractNumber"],
                    "nihICsSupportingStudy": upload_dict[study_name]["nihICsSupportingStudy"],
                    "nihProgramOfficerName": upload_dict[study_name]["nihProgramOfficerName"],
                    "nihInstitutionCenterSubmission": upload_dict[study_name]["nihInstitutionCenterSubmission"],
                    "nihInstitutionalCertificationFileName": upload_dict[study_name]["nihInstitutionalCertificationFileName"],
                    "nihGenomicProgramAdministratorName": upload_dict[study_name]["nihGenomicProgramAdministratorName"],
                    "collaboratingSites": upload_dict[study_name]["collaboratingSites"],
                    "alternativeDataSharingPlan": upload_dict[study_name]["alternativeDataSharingPlan"],
                    "consentGroups": consent_group_list
                }
                upload_dict[study_name] = study_dict
        results_log.append(["Input File Formatting", "Succeeded", ""])
    except Exception as e:
        msg = f"Error parsing and formatting input file: {str(e)}"
        results_log.append(["Input File Formatting", "Failed", msg])
        print(msg)
        return results_log
    
    # Loop through studies and dataset to upload
    for study in upload_dict.keys():
        print(f"Uploading data for study {study} into DUOS")
        # For studies that don't exist in DUOS, create a new study
        if not study_lookup.get(study):
            print("Study does NOT currently exist in DUOS. Creating new study and dataset records...")
            try:
                new_study_response = requests.post(
                    url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/v3",
                    headers={"Authorization": f"Bearer {token}"},
                    files = {
                        "dataset": json.dumps(upload_dict[study]),
                        "alternativeDataSharingPlan": "",
                        "consentGroups[0].nihInstitutionalCertificationFile": ""  
                    }
                ).json()
                if new_study_response.get("studyId"):
                    study_id = new_study_response["studyId"]
                    msg = f"Study registration succeeded! Study Id: {study_id}"
                    results_log.append([f"New Study Registration - {study}", "Succeeded", msg])
                    print(msg)
                else:
                    err_msg = new_study_response["message"]
                    msg = f"Study registration failed: {err_msg}"
                    results_log.append([f"New Study Registration - {study}", "Failed", msg])
                    print(msg)
            except Exception as e:
                msg = f"Study registration failed: {str(e)}"
                results_log.append([f"New Study Registration - {study}", "Failed", msg])
                print(msg)
                
        # For studies that already exist in DUOS, lookup the study ID and update the existing study
        else:
            print("Study currently exists in DUOS. Updating study and dataset records...")
            pass
            try:
                # Update study in DUOS
                study_id = study_lookup.get(study)
                update_study_response = requests.put(
                    url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/study/{study_id}",
                    headers={"Authorization": f"Bearer {token}"},
                    files = {
                        "dataset": json.dumps(upload_dict[study]),
                        "alternativeDataSharingPlan": "",
                        "consentGroups[0].nihInstitutionalCertificationFile": ""  
                    }
                ).json()   
                if update_study_response.get("studyId"):
                    study_id = update_study_response["studyId"]
                    msg = f"Study registration succeeded! Study Id: {study_id}"
                    results_log.append([f"New Study Registration - {study}", "Succeeded", msg])
                    print(msg)
                else:
                    err_msg = update_study_response["message"]
                    msg = f"Study registration failed: {err_msg}"
                    results_log.append([f"New Study Registration - {study}", "Failed", msg])
                    print(msg)
            except Exception as e:
                msg = f"Study registration failed: {str(e)}"
                results_log.append([f"Study Registration Update - {study}", "Failed", msg])
                print(msg)
    
    # Return results
    return results_log


#############################################
## Input Parameters
#############################################

import os
from getpass import getpass

# Cloud path to file to process
input_file_gcs_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/anvil_dataset_metadata_20231214.txt"

# User token (use gcloud auth print-access-token to get this).
# The token is a live credential, so it must never be pasted into the notebook,
# where it would be saved and shared with the file. Export it as
# DUOS_ACCESS_TOKEN before starting the kernel, or type it at the hidden prompt.
token = (os.environ.get("DUOS_ACCESS_TOKEN") or getpass("DUOS access token: ")).strip()


#############################################
## Execution
#############################################

# Run the upload and show one row per logged step (item, status, message).
upload_results = upload_to_duos(input_file_gcs_path, token)
df_results = pd.DataFrame(upload_results, columns=["Item", "Status", "Message"])
print("\nUpload Results:")
display(df_results)


Downloading input file gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/anvil_dataset_metadata_20231214.txt...
Building study and dataset lookup dicts from DUOS...
Parsing input file and formatting into DUOS schema...
Parsing and formatting metadata for snapshot 5184edeb-81f8-406b-926a-64604090904e from the input file. Target study is: 1000Genomes
Parsing and formatting metadata for snapshot 4d39a01d-0ed6-42b5-9200-91b0d848a42b from the input file. Target study is: AnVIL_CCDG_Broad_NP_Epilepsy_GHAKNT_GRU_WES
Parsing and formatting metadata for snapshot 8eb8326d-a74a-4bee-b4ea-b1d211114996 from the input file. Target study is: AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_WES
Parsing and formatting metadata for snapshot a1dcd80f-6390-489a-a34a-168f26690a36 from the input file. Target study is: AnVIL_CCDG_Broad_NP_Epilepsy_JPNFKA_GRU_WES
Parsing and formatting metadata for snapshot af99a317-e7a6-4e0f-88fb-f2a6c438ca5d from the input file. Target study is: AnVIL_CCDG_Broad_NP_Epilep

Study registration succeeded! Study Id: 5918
Uploading data for study AnVIL_CCDG_Broad_NP_Epilepsy_GHAKNT_GRU_WES into DUOS
Study does NOT currently exist in DUOS. Creating new study and dataset records...
Study registration succeeded! Study Id: 5919
Uploading data for study AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_WES into DUOS
Study does NOT currently exist in DUOS. Creating new study and dataset records...
Study registration succeeded! Study Id: 5920
Uploading data for study AnVIL_CCDG_Broad_NP_Epilepsy_JPNFKA_GRU_WES into DUOS
Study does NOT currently exist in DUOS. Creating new study and dataset records...
Study registration succeeded! Study Id: 5921
Uploading data for study AnVIL_CCDG_Broad_NP_Epilepsy_KENKIL_GRU_WES into DUOS
Study does NOT currently exist in DUOS. Creating new study and dataset records...
Study registration succeeded! Study Id: 5922
Uploading data for study AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_DS_Epilepsy_WES into DUOS
Study does NOT currently exist in DUOS. Creat

Study registration succeeded! Study Id: 5955
Uploading data for study Center for Common Disease Genomics [CCDG] - Inflammatory Bowel Disease (IBD) - Global Microbiome Conservancy Host Exomes (phs002205) into DUOS
Study does NOT currently exist in DUOS. Creating new study and dataset records...
Study registration succeeded! Study Id: 5956
Uploading data for study Center for Common Disease Genomics [CCDG] Neuropsychiatric: Autism Spectrum Disorder (ASD) Ð Whole Exomes (phs002502) into DUOS
Study does NOT currently exist in DUOS. Creating new study and dataset records...
Study registration succeeded! Study Id: 5957
Uploading data for study Center for Common Disease Genomics [CCDG]: Variant Calling Controls (phs002163) into DUOS
Study does NOT currently exist in DUOS. Creating new study and dataset records...
Study registration succeeded! Study Id: 5958
Uploading data for study Genetic Neuroscience: How Human Genes and Alleles Shape Neuronal Phenotypes (phs002032) into DUOS
Study does NOT 

Unnamed: 0,Item,Status,Message
0,Input File Download,Succeeded,
1,DUOS Study and Dataset Lookup Dict Creation,Succeeded,
2,Input File Formatting,Succeeded,
3,New Study Registration - 1000Genomes,Succeeded,Study registration succeeded! Study Id: 5918
4,New Study Registration - AnVIL_CCDG_Broad_NP_Epilepsy_GHAKNT_GRU_WES,Succeeded,Study registration succeeded! Study Id: 5919
5,New Study Registration - AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_WES,Succeeded,Study registration succeeded! Study Id: 5920
6,New Study Registration - AnVIL_CCDG_Broad_NP_Epilepsy_JPNFKA_GRU_WES,Succeeded,Study registration succeeded! Study Id: 5921
7,New Study Registration - AnVIL_CCDG_Broad_NP_Epilepsy_KENKIL_GRU_WES,Succeeded,Study registration succeeded! Study Id: 5922
8,New Study Registration - AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_DS_Epilepsy_WES,Succeeded,Study registration succeeded! Study Id: 5923
9,New Study Registration - AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_WES,Succeeded,Study registration succeeded! Study Id: 5924


## Testing

In [86]:
import os
from getpass import getpass

input_file = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/dataset_metadata_1.txt"
# The bearer token is a live credential: read it from the environment (or a
# hidden prompt) instead of storing it in the notebook source.
token = (os.environ.get("DUOS_ACCESS_TOKEN") or getpass("DUOS access token: ")).strip()

# Pull down specified file from the cloud
results_log = []
print(f"Downloading input file {input_file}...")
try:
    # Read the file named in this cell. Reading `input_file_gcs_path` here would
    # silently process whatever path another cell last assigned, while the log
    # messages claim `input_file` was used.
    input_df = pd.read_csv(input_file, delimiter="\t", encoding='unicode_escape')
    # Normalise every missing value to an empty string so the records are JSON friendly.
    input_df = input_df.astype(object).where(pd.notnull(input_df), None)
    input_df.fillna("", inplace=True)
    input_dict = input_df.to_dict(orient="records")
    results_log.append(["Input File Download", "Succeeded", ""])
except Exception as e:
    msg = f"Error downloading input file ({input_file}): {str(e)}"
    results_log.append(["Input File Download", "Failed", msg])
    print(msg)
#     return results_log

# Query DUOS for its datasets and index them two ways:
#   study_lookup:   study name   -> studyId (first non-empty id seen wins)
#   dataset_lookup: dataset name -> dataSetId (last one seen wins)
print("Building study and dataset lookup dicts from DUOS...")
try:
    datasets = requests.get(
        url="https://consent.dsde-dev.broadinstitute.org/api/dataset/v2?asCustodian=false",
        headers={"Authorization": f"Bearer {token}"}
    ).json()
    study_lookup = {}
    dataset_lookup = {}
    for dataset_entry in datasets:
        study_info = dataset_entry["study"]
        study_title = study_info.get("name")
        # Only record a study when it is named and has no usable id recorded yet.
        if study_title and not study_lookup.get(study_title):
            study_lookup[study_title] = study_info["studyId"]
        dataset_title = dataset_entry.get("name")
        if dataset_title:
            dataset_lookup[dataset_title] = dataset_entry["dataSetId"]
    results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Succeeded", ""])
except Exception as e:
    msg = f"Error building study and dataset lookups: {str(e)}"
    results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Failed", msg])
    print(msg)
#     return results_log

# Convert the input rows into DUOS study payloads. Each row contributes one
# consent group; rows sharing a studyName are merged into one study entry.
print("Parsing input file and formatting into DUOS schema...")
try:
    # Determine data submitter id
    response = requests.get(
        url="https://consent.dsde-dev.broadinstitute.org/api/user/me",
        headers={"Authorization": f"Bearer {token}"}
    ).json()
    data_submitter_id = response["userId"]
    # Build dictionary for upload (study name -> study payload)
    upload_dict = {}
    for input_entry in input_dict:
        snapshot_id = input_entry["snapshot_id"]
        study_name = input_entry["studyName"]
        consent_group_name = input_entry["consentGroups.consentGroupName"]
        print(f"Parsing and formatting metadata for snapshot {snapshot_id} from the input file. Target study is: {study_name}")
        study_id = study_lookup.get(study_name)
        dataset_id = dataset_lookup.get(consent_group_name)

        # Assemble the consent group in a scratch dict and publish it once complete.
        pending_group = {"consentGroupName": consent_group_name}
        if study_id and dataset_id:
            # Both study and dataset are already in DUOS: send the datasetId with a reduced field set.
            pending_group["datasetId"] = dataset_id
            copied_fields = ("numberOfParticipants", "dataLocation", "url")
        else:
            # Not yet in DUOS: send the full set of data-use fields.
            pending_group["dataAccessCommitteeId"] = 3
            # "mor" and "morDate" are left out for now (date formatting validation).
            copied_fields = (
                "accessManagement",
                "numberOfParticipants",
                "generalResearchUse",
                "hmb",
                "diseaseSpecificUse",
                "gs",
                "poa",
                "nmds",
                "gso",
                "pub",
                "col",
                "irb",
                "npu",
                "otherPrimary",
                "otherSecondary",
                "dataLocation",
                "url",
            )
        for field in copied_fields:
            field_value = input_entry[f"consentGroups.{field}"]
            if field == "diseaseSpecificUse":
                field_value = format_list(field_value, 0)
            pending_group[field] = field_value
        # fileTypes is an enumeration in DUOS and is excluded for now
        # (would otherwise come from format_file_types on the consentGroups.fileTypes.* columns).
        pending_group["fileTypes"] = []
        consent_group_dict = pending_group

        if study_name in upload_dict:
            # Study already staged: carry its fields over unchanged and replace any
            # consent group with the same name by the one just built.
            staged_study = upload_dict[study_name]
            consent_group_list = [
                staged_group
                for staged_group in staged_study["consentGroups"]
                if staged_group["consentGroupName"] != consent_group_dict["consentGroupName"]
            ]
            consent_group_list.append(consent_group_dict)
            study_dict = {**staged_study, "studyName": study_name, "consentGroups": consent_group_list}
        else:
            # First row for this study: populate the study-level fields from the row.
            # Excluded for now: studyType (enumeration), embargoReleaseDate (date
            # formatting validation), real piInstitution (integer ID for registered institutions).
            consent_group_list = [consent_group_dict]
            study_dict = {
                "studyName": study_name,
                "studyDescription": input_entry["studyDescription"],
                "dataTypes": format_list(input_entry["dataTypes"], 1),
                "phenotypeIndication": input_entry["phenotypeIndication"],
                "species": input_entry["species"],
                "piName": input_entry["piName"],
                "dataSubmitterUserId": data_submitter_id,
                "dataCustodianEmail": format_list(input_entry["dataCustodianEmail"], 0),
                "publicVisibility": input_entry["publicVisibility"],
                "nihAnvilUse": input_entry["nihAnvilUse"],
                "submittingToAnvil": input_entry["submittingToAnvil"],
                "dbGaPPhsID": input_entry["dbGaPPhsID"],
                "dbGaPStudyRegistrationName": input_entry["studyName"],
                "sequencingCenter": input_entry["sequencingCenter"],
                "piEmail": input_entry["piEmail"],
                "piInstitution": 0,
                "nihGrantContractNumber": "Unknown",  # Required currently
                "nihICsSupportingStudy": format_list(input_entry["nihICsSupportingStudy"], 0),
                "nihProgramOfficerName": input_entry["nihProgramOfficerName"],
                "nihInstitutionCenterSubmission": input_entry["nihInstitutionCenterSubmission"],
                "nihInstitutionalCertificationFileName": input_entry["nihInstitutionalCertificationFileName"],
                "nihGenomicProgramAdministratorName": input_entry["nihGenomicProgramAdministratorName"],
                "collaboratingSites": format_list(input_entry["collaboratingSites"], 0),
                "alternativeDataSharingPlan": input_entry["alternativeDataSharingPlan"],
                "consentGroups": consent_group_list,
            }
        upload_dict[study_name] = study_dict
    results_log.append(["Input File Formatting", "Succeeded", ""])
except Exception as e:
    msg = f"Error parsing and formatting input file: {str(e)}"
    results_log.append(["Input File Formatting", "Failed", msg])
    print(msg)
#     return results_log

# Loop through the staged studies: create those DUOS does not know yet and
# update those it already has. Every outcome is appended to results_log as
# [item, status, message]; a failure on one study does not stop the others.
for study in upload_dict:
    print(f"Uploading data for study {study} into DUOS")
    # For studies that don't exist in DUOS, create a new study
    if not study_lookup.get(study):
        print("Study does NOT currently exist in DUOS. Creating new study and dataset records...")
        try:
            new_study_response = requests.post(
                url="https://consent.dsde-dev.broadinstitute.org/api/dataset/v3",
                headers={"Authorization": f"Bearer {token}"},
                files={
                    "dataset": json.dumps(upload_dict[study]),
                    "alternativeDataSharingPlan": "",
                    "consentGroups[0].nihInstitutionalCertificationFile": ""
                }
            ).json()
            if new_study_response.get("studyId"):
                study_id = new_study_response["studyId"]
                msg = f"Study registration succeeded! Study Id: {study_id}"
                results_log.append([f"New Study Registration - {study}", "Succeeded", msg])
                print(msg)
            else:
                # Fall back to the whole payload so an error body without a
                # "message" key is still reported instead of a bare KeyError.
                err_msg = new_study_response.get("message", new_study_response)
                msg = f"Study registration failed: {err_msg}"
                results_log.append([f"New Study Registration - {study}", "Failed", msg])
                print(msg)
        except Exception as e:
            msg = f"Study registration failed: {str(e)}"
            results_log.append([f"New Study Registration - {study}", "Failed", msg])
            print(msg)

    # For studies that already exist in DUOS, lookup the study ID and update the existing study
    else:
        print("Study currently exists in DUOS. Updating study and dataset records...")
        try:
            # Dataset IDs for consent groups already in DUOS are attached while
            # the input file is parsed, so the staged payload is sent as is.
            study_id = study_lookup.get(study)
            update_study_response = requests.put(
                url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/study/{study_id}",
                headers={"Authorization": f"Bearer {token}"},
                files={
                    "dataset": json.dumps(upload_dict[study]),
                    "alternativeDataSharingPlan": "",
                    "consentGroups[0].nihInstitutionalCertificationFile": ""
                }
            ).json()
            # All update outcomes share the "Study Registration Update" label so
            # they can be told apart from new registrations in the results table.
            if update_study_response.get("studyId"):
                study_id = update_study_response["studyId"]
                msg = f"Study registration succeeded! Study Id: {study_id}"
                results_log.append([f"Study Registration Update - {study}", "Succeeded", msg])
                print(msg)
            else:
                err_msg = update_study_response.get("message", update_study_response)
                msg = f"Study registration failed: {err_msg}"
                results_log.append([f"Study Registration Update - {study}", "Failed", msg])
                print(msg)
        except Exception as e:
            msg = f"Study registration failed: {str(e)}"
            results_log.append([f"Study Registration Update - {study}", "Failed", msg])
            print(msg)

Downloading input file gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/dataset_metadata_1.txt...
Building study and dataset lookup dicts from DUOS...
Parsing input file and formatting into DUOS schema...
Parsing and formatting metadata for snapshot 5184edeb-81f8-406b-926a-64604090904e from the input file. Target study is: 1000Genomes
Parsing and formatting metadata for snapshot 02d25240-823f-4b1d-8562-95385716a453 from the input file. Target study is: Genomic Answers for Kids (GA4K)
Parsing and formatting metadata for snapshot 1974a21b-c409-4736-a3d7-e195fa96c4eb from the input file. Target study is: Genomic Answers for Kids (GA4K)
Parsing and formatting metadata for snapshot 99b46287-4790-492c-8a12-bea33f0f927c from the input file. Target study is: Genomic Answers for Kids (GA4K)
Parsing and formatting metadata for snapshot 08d19a7e-b868-4766-9f7e-d879d972cbd7 from the input file. Target study is: Genomic Answers for Kids (GA4K)
Parsing and formatting metadata for snapsh

In [87]:
update_study_response

{'message': 'Cannot invoke "java.util.Set.stream()" because the return value of "org.broadinstitute.consent.http.models.Study.getDatasets()" is null',
 'code': 500}

# Script Development

## Fetch parameters from snapshot/dataset

In [None]:
# Parameters
snapshot_id = "099d2585-1379-4333-b3b1-ffc0d26d95c5"

# Retrieve snapshot details
api_client = refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
# Only the first source dataset of the snapshot is considered.
source_dataset = snapshot_details["source"][0]["dataset"]
dataset_id = source_dataset["id"]
phs_id = source_dataset["phs_id"]

# Retrieve dataset details
dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["PROPERTIES"]).to_dict()
# Start from None so a dataset without these properties neither raises a
# NameError at the print below nor shows a value left over from an earlier run.
auth_domain = None
source_workspace = None
dataset_properties = dataset_details.get("properties") or {}
if dataset_properties.get("auth_domains"):
    auth_domain = dataset_properties["auth_domains"][0]
if dataset_properties.get("source_workspaces"):
    source_workspace = dataset_properties["source_workspaces"][0]

# Print output
print(phs_id)
print(source_workspace)

## Pulling Workspace Attributes

In [None]:
# Parameters
ws_project = "anvil-datastorage"
ws_name = "AnVIL_GREGOR_RELEASE_01_HMB"

# Establish credentials
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Pull workspace attributes
ws_attributes = requests.get(
    url=f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
    headers={"Authorization": f"Bearer {creds.token}"}
).json()

# Map to schema. Every library attribute is treated as optional: a missing one
# maps to None instead of aborting the whole mapping.
attributes = ws_attributes["workspace"]["attributes"]
terra_dict = {}
terra_dict["studyName"] = attributes.get("library:projectName")
terra_dict["studyType"] = attributes.get("library:studyDesign")
#terra_dict["studyDescription"] = attributes.get("description")
# List-valued attributes are read from the "items" key of the attribute value.
terra_dict["dataTypes"] = (attributes.get("library:dataCategory") or {}).get("items")
terra_dict["phenotypeIndication"] = attributes.get("library:indication")
terra_dict["species"] = "Homo sapiens"
terra_dict["piName"] = attributes.get("library:datasetOwner")
terra_dict["dataCustodianEmail"] = attributes.get("library:contactEmail")
# Consortium and dbGaP id are taken from "<label>:<value>" workspace tags, when present.
for tag in (attributes.get("tag:tags") or {}).get("items") or []:
    if "Consortium:" in tag:
        terra_dict["consortium"] = tag.split(":")[1].strip()
    elif "dbGaP:" in tag:
        terra_dict["dbGaPPhsID"] = tag.split(":")[1].strip()
terra_dict["consentGroups.consentCode"] = attributes.get("library:dataUseRestriction")
terra_dict["consentGroups.fileTypes.fileType"] = (attributes.get("library:datatype") or {}).get("items")

# View schema
print(terra_dict)


In [None]:
ws_attributes

In [None]:
ws_attributes

## dbGaP XML Parse

In [None]:
# Parameters
phs_id = "phs003047"
#phs_id = "phs000693"

# Pull and parse XML
phs_short = phs_id.replace("phs", "")
dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
response = requests.get(url=dbgap_url)
xml_data = xmltodict.parse(response.text)

# Map to schema
# xmltodict yields a dict for an element that occurs once and a list when it
# repeats, so Study, Person and IC are each normalised before use.
dbgap_xml_dict = {}
if isinstance(xml_data["dbgapss"]["Study"], list):
    # Several Study elements returned: use the first one.
    study_data = xml_data["dbgapss"]["Study"][0]
else:
    study_data = xml_data["dbgapss"]["Study"]
dbgap_xml_dict["studyName"] = study_data["StudyInfo"].get("StudyNameEntrez")
dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
dbgap_xml_dict["dbGaPPhsID"] = phs_id
dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
person_entries = study_data["Authority"]["Persons"]["Person"]
if not isinstance(person_entries, list):
    # A single Person arrives as a dict; iterating it directly would walk its
    # keys (strings) and fail on ap_entry["Role"].
    person_entries = [person_entries]
for ap_entry in person_entries:
    if ap_entry["Role"] == "PI":
        dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
        dbgap_xml_dict["piEmail"] = ap_entry["@email"]
        dbgap_xml_dict["piInstitution"] = ap_entry["Organization"]
    elif ap_entry["Role"] == "PO" and ap_entry["Organization"] == "NIH":
        dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
    elif ap_entry["Role"] == "GPA" and ap_entry["Organization"] == "NIH":
        dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
ic_entries = study_data["Authority"]["ICs"]["IC"]
if not isinstance(ic_entries, list):
    ic_entries = [ic_entries]
ic_list = [ic_entry["@name"] for ic_entry in ic_entries]
dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
dbgap_xml_dict["numberOfParticipants"] = study_data.get("@num_participants")
dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")

# View schema
print(dbgap_xml_dict)


In [None]:
study_data

In [None]:
study_data

## dbGaP Study API

In [None]:
# Parameters
study_uid = 483191234

# Pull and parse JSON
dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
response = requests.get(url=dbgap_study_url)
study_api_data = json.loads(response.text)

# Map to schema (left empty when the response carries an "error" value)
dbgap_study_api_dict = {}
if study_api_data.get("error") is None:
    study_config = study_api_data["data"]
    dbgap_study_api_dict["studyName"] = study_config.get("report_name")
    dbgap_study_api_dict["studyDescription"] = study_config.get("description")
    dbgap_study_api_dict["phenotypeIndication"] = study_config.get("primary_disease")
    dbgap_study_api_dict["studyType"] = study_config.get("study_design")
    # "attribution" may be absent or null; treat that as no attributions
    # rather than failing on iteration over None.
    for attr_entry in study_config.get("attribution") or []:
        if attr_entry.get("title") == "Principal Investigator":
            # Only the first Principal Investigator entry is used.
            dbgap_study_api_dict["piName"] = attr_entry.get("name")
            dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
            break

# View schema
print(dbgap_study_api_dict)

In [None]:
study_api_data

## dbGaP FHIR API

In [None]:
# Parameters
# Alternate example study: phs_id = "phs003047"
phs_id = "phs000693"

# Pull and parse JSON from the dbGaP FHIR ResearchStudy endpoint
dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
response = requests.get(url=dbgap_fhir_url)
fhir_data = json.loads(response.text)

# Only the first entry of the returned bundle is used. Fail with a clear message
# (rather than an opaque KeyError/IndexError) when the bundle has no entries.
fhir_entries = fhir_data.get("entry") or []
if not fhir_entries:
    raise ValueError(f"No ResearchStudy entry returned by the dbGaP FHIR API for {phs_id}")
fhir_resource = fhir_entries[0]["resource"]

# Shared URL prefixes for the code system / extension identifiers matched below
fhir_base_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1"
structure_def_url = fhir_base_url + "/StructureDefinition"
# Optional list-valued sections may be absent or null; "or []" keeps the loops below from raising
fhir_extensions = fhir_resource.get("extension") or []

# Map to schema
dbgap_fhir_dict = {}
dbgap_fhir_dict["studyName"] = fhir_resource.get("title")
dbgap_fhir_dict["studyDescription"] = fhir_resource.get("description")
dbgap_fhir_dict["dbGaPPhsID"] = phs_id
dbgap_fhir_dict["dbGaPStudyRegistrationName"] = fhir_resource.get("title")
dbgap_fhir_dict["nihICsSupportingStudy"] = (fhir_resource.get("sponsor") or {}).get("display")

# studyType: comma-separated display (or code, when display is empty) of every study-design coding.
# The key is only set when at least one value was found.
study_design_system = fhir_base_url + "/CodeSystem/ResearchStudy-StudyDesign"
study_type_values = []
for cat_entry in fhir_resource.get("category") or []:
    for coding_entry in cat_entry.get("coding") or []:
        if coding_entry.get("system") == study_design_system:
            value = coding_entry.get("display") or coding_entry.get("code")
            if value:
                study_type_values.append(value)
if study_type_values:
    dbgap_fhir_dict["studyType"] = ", ".join(study_type_values)

# dataTypes: codes of every molecular data type listed in the MolecularDataTypes extension
molecular_types_url = structure_def_url + "/ResearchStudy-MolecularDataTypes"
molecular_type_url = structure_def_url + "/ResearchStudy-MolecularDataTypes-MolecularDataType"
dt_list = []
for ext_entry in fhir_extensions:
    if ext_entry.get("url") == molecular_types_url:
        for inner_ext_entry in ext_entry.get("extension") or []:
            if inner_ext_entry.get("url") == molecular_type_url:
                concept = inner_ext_entry.get("valueCodeableConcept") or {}
                for coding_entry in concept.get("coding") or []:
                    dt_list.append(coding_entry.get("code"))
dbgap_fhir_dict["dataTypes"] = dt_list

# phenotypeIndication: comma-separated display (or code) of every focus coding; key only set when found
phenotype_values = []
for focus_entry in fhir_resource.get("focus") or []:
    for coding_entry in focus_entry.get("coding") or []:
        value = coding_entry.get("display") or coding_entry.get("code")
        if value:
            phenotype_values.append(value)
if phenotype_values:
    dbgap_fhir_dict["phenotypeIndication"] = ", ".join(phenotype_values)

# numberOfParticipants: taken from the NumSubjects entry of the Content extension (last match wins)
content_url = structure_def_url + "/ResearchStudy-Content"
num_subjects_url = structure_def_url + "/ResearchStudy-Content-NumSubjects"
for ext_entry in fhir_extensions:
    if ext_entry.get("url") == content_url:
        for inner_ext_entry in ext_entry.get("extension") or []:
            if inner_ext_entry.get("url") == num_subjects_url:
                # NOTE(review): this reads valueCount["code"]; in the FHIR Count datatype the number is
                # normally carried in "value" and "code" is the unit -- confirm against a live payload.
                value_count = inner_ext_entry.get("valueCount") or {}
                dbgap_fhir_dict["numberOfParticipants"] = value_count.get("code")

# View schema
print(dbgap_fhir_dict)

In [None]:
fhir_data

# DUOS Load

# Utilities

## Delete Studies from DUOS

In [108]:
# Inputs
# SECURITY: never paste a bearer token into the notebook -- it is persisted in the saved cells and
# ends up in version control. The token is read from the DUOS_TOKEN environment variable instead,
# falling back to an interactive (non-echoing) prompt when the variable is not set.
import os
from getpass import getpass

token = os.environ.get("DUOS_TOKEN") or getpass("DUOS bearer token: ")
study_id_list = [
    # DUOS study IDs to delete
]

# Delete studies
for study_id in study_id_list:
    print(f"Deleting study ID {study_id}")
    response = requests.delete(
        url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/study/{study_id}",
        headers={"Authorization": f"Bearer {token}"},
    )
    if response.status_code == 200:
        print("Study deleted successfully.")
    else:
        # Error bodies are not guaranteed to be JSON objects with a "message" key (e.g. proxy or
        # HTML error pages), so fall back to the raw text instead of raising while reporting.
        try:
            error_body = response.json()
        except ValueError:
            error_body = None
        if isinstance(error_body, dict) and "message" in error_body:
            msg = error_body["message"]
        else:
            msg = response.text
        print(f"Error deleting study: {msg}")
    

Deleting study ID 5903
Study deleted successfully.
Deleting study ID 5905
Study deleted successfully.
Deleting study ID 5901
Study deleted successfully.
Deleting study ID 5906
Study deleted successfully.
Deleting study ID 5907
Study deleted successfully.
Deleting study ID 5910
Study deleted successfully.
Deleting study ID 5911
Study deleted successfully.
Deleting study ID 5913
Study deleted successfully.
Deleting study ID 5916
Study deleted successfully.
Deleting study ID 5917
Study deleted successfully.
Deleting study ID 5912
Study deleted successfully.
Deleting study ID 5915
Study deleted successfully.
Deleting study ID 5914
Study deleted successfully.
Deleting study ID 5866
Study deleted successfully.
Deleting study ID 5867
Study deleted successfully.
Deleting study ID 5868
Study deleted successfully.
Deleting study ID 5869
Study deleted successfully.
Deleting study ID 5870
Study deleted successfully.
Deleting study ID 5873
Study deleted successfully.
Deleting study ID 5871
Study de

## Build Lookup of Datasets and Studies in DUOS

In [99]:
# Inputs
# SECURITY: never paste a bearer token into the notebook -- it is persisted in the saved cells and
# ends up in version control. The token is read from the DUOS_TOKEN environment variable instead,
# falling back to an interactive (non-echoing) prompt when the variable is not set.
import os
from getpass import getpass

token = os.environ.get("DUOS_TOKEN") or getpass("DUOS bearer token: ")
user_id = 5100  # Set to None to return all datasets/studies, otherwise will filter on those created or updated by the specified user

# Pull a list of existing datasets and studies from DUOS and build lookup dicts
datasets_response = requests.get(
    url="https://consent.dsde-dev.broadinstitute.org/api/dataset/v2?asCustodian=false",
    headers={"Authorization": f"Bearer {token}"},
)
# Surface auth/server failures here instead of failing later while iterating an error payload
datasets_response.raise_for_status()
datasets = datasets_response.json()

study_lookup = {}  # study name -> studyId (first dataset carrying a given study name wins)
dataset_lookup = {}  # dataset name -> dataSetId (later duplicates overwrite earlier ones)
for dataset_entry in datasets:
    # When a user filter is set, keep only datasets created OR updated by that user.
    # (The original gated updateUserId on the presence of createUserId, a copy-paste slip that
    # dropped datasets the user had updated whenever createUserId was missing.)
    if user_id and user_id not in (dataset_entry.get("createUserId"), dataset_entry.get("updateUserId")):
        continue
    # Datasets without an attached study are tolerated; they only contribute to dataset_lookup
    study_entry = dataset_entry.get("study") or {}
    study_name = study_entry.get("name")
    if study_name and not study_lookup.get(study_name):
        study_lookup[study_name] = study_entry["studyId"]
    dataset_name = dataset_entry.get("name")
    if dataset_name:
        dataset_lookup[dataset_name] = dataset_entry["dataSetId"]

In [106]:
# Flatten the study lookup into a plain list of study IDs (insertion order preserved)
study_list = list(study_lookup.values())

In [107]:
study_list

[5903,
 5905,
 5901,
 5906,
 5907,
 5910,
 5911,
 5913,
 5916,
 5917,
 5912,
 5915,
 5914,
 5866,
 5867,
 5868,
 5869,
 5870,
 5873,
 5871,
 5872,
 5874,
 5875,
 5881,
 5882,
 5883,
 5876,
 5877,
 5878,
 5880,
 5879,
 5884,
 5885,
 5886,
 5889,
 5892,
 5893,
 5897,
 5887,
 5890,
 5894,
 5895,
 5896,
 5898,
 5900,
 5888,
 5891,
 5902,
 5909,
 5899,
 5904,
 5908]

In [100]:
dataset_lookup

{'ANVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_WES_20230110_ANV5_202304242243': 2111,
 'ANVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_WES_20221205_ANV5_202304242246': 2113,
 'ANVIL_CCDG_Broad_NP_Epilepsy_KENKIL_GRU_WES_20230110_ANV5_202304242241': 2109,
 'ANVIL_CMG_Broad_Muscle_OGrady_WES_20221205_ANV5_202304242252': 2114,
 'ANVIL_CMG_Broad_Orphan_Estonia_Ounap_WGS_20221205_ANV5_202304242255': 2115,
 'ANVIL_CMG_Broad_Eye_Pierce_WGS_20221117_ANV5_202304241507': 2118,
 'ANVIL_CMG_Broad_Muscle_Bonnemann_WES_20221117_ANV5_202304241509': 2119,
 'ANVIL_CMG_Broad_Muscle_Bonnemann_WGS_20221117_ANV5_202304241510': 2120,
 'ANVIL_CMG_Broad_Orphan_Estonia_Ounap_WES_20221117_ANV5_202304241512': 2121,
 'ANVIL_CMG_Broad_Orphan_Manton_WES_20221117_ANV5_202304241513': 2122,
 'ANVIL_CMG_Broad_Orphan_Manton_WGS_20221117_ANV5_202304241515': 2123,
 'ANVIL_ccdg_broad_daly_igsr_1kg_twist_gsa_20221202_ANV5_202304271343': 2124,
 'ANVIL_CMG_Broad_Eye_Pierce_WES_20221205_ANV5_202304242250': 2141,
 'ANVIL_NIMH_Broad_WGSPD1_