# Imports

In [None]:
#!pip install --upgrade data_repo_client
#!pip install --upgrade xmltodict

In [2]:
# Imports
import requests
import json
import google.auth
import xmltodict
import data_repo_client
import pandas as pd
import re
from time import sleep
import ast

# Function to refresh TDR API client
def refresh_tdr_api_client():
    """Build a Terra Data Repo API client that carries a freshly refreshed token.

    Looks up the Google default credentials, refreshes them so that
    ``creds.token`` is populated, and returns a ``data_repo_client.ApiClient``
    configured for ``https://data.terra.bio`` with client-side validation
    switched off.

    Returns:
        data_repo_client.ApiClient: client configured with the refreshed
        access token.
    """
    # `import google.auth` at the top of the file does not by itself load the
    # `google.auth.transport.requests` submodule, so import the Request class
    # explicitly rather than relying on some other package having imported it.
    from google.auth.transport.requests import Request

    creds, _project = google.auth.default()
    creds.refresh(Request())

    config = data_repo_client.Configuration()
    config.host = "https://data.terra.bio"
    config.access_token = creds.token

    api_client = data_repo_client.ApiClient(configuration=config)
    api_client.client_side_validation = False
    return api_client

# Display options: applied in order as (option name, value) pairs.
for _option_name, _option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ("display.max_colwidth", None),
    ('display.width', 1000),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
):
    pd.set_option(_option_name, _option_value)

# Step 1: Collect Metadata for Review

In [16]:
#############################################
## Functions
#############################################

def coalesce(*arg):
    """Return the first argument that holds a usable value.

    Arguments are examined in order:

    * ``None`` is skipped.
    * A list ends the search immediately: it is returned with ``None``
      entries and placeholder entries removed. If nothing survives, an empty
      list is returned and later arguments are NOT consulted.
    * Any other value is returned as-is unless its upper-cased string form is
      a placeholder, in which case the search continues.

    Placeholders (compared case-insensitively) are: "", "NA", "N/A", "NONE",
    "TBD", "UNKNOWN" and "UNSPECIFIED".

    Returns:
        The first usable value, a (possibly empty) filtered list, or ``None``
        when no argument qualifies.
    """
    placeholders = {"", "NA", "N/A", "NONE", "TBD", "UNKNOWN", "UNSPECIFIED"}
    for candidate in arg:
        if candidate is None:
            continue
        if isinstance(candidate, list):
            # str() mirrors the scalar branch below and keeps non-string
            # elements (e.g. ints) from raising AttributeError on .upper().
            return [
                ele
                for ele in candidate
                if ele is not None and str(ele).upper() not in placeholders
            ]
        if str(candidate).upper() not in placeholders:
            return candidate
    return None

def format_description(input_string):
    """Normalise a study description for display.

    * A falsy input (``None``, ``""``) becomes an empty string.
    * The sequence newline-newline-tab, and any remaining tab, is replaced by
      a single space.
    * Relative dbGaP study links (``study.cgi?study_id=`` or
      ``./study.cgi?study_id=``) are expanded to the absolute
      ``https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/`` URL.

    Returns:
        str: the cleaned description.
    """
    absolute_prefix = "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id="

    output_string = input_string if input_string else ""
    output_string = re.sub("\n\n\t", " ", output_string)
    output_string = re.sub("\t", " ", output_string)
    # Raw pattern with escaped dots so '.' only matches a literal dot. The
    # lookbehind skips links that are already part of a longer path or URL
    # (preceded by a word character or '/'), which would otherwise have the
    # prefix spliced into the middle of them.
    output_string = re.sub(
        r"(?<![\w/])(?:\./)?study\.cgi\?study_id=",
        absolute_prefix,
        output_string,
    )
    return output_string

def format_phs_id(input_str):
    """Normalise a dbGaP accession to the form ``phs`` + six-digit number.

    Searches ``input_str`` case-insensitively for ``phs`` followed by digits,
    drops leading zeros from the number, and zero-pads it back to at least six
    digits. Anything after the digits (e.g. ``.v1.p1``) is ignored.

    Returns:
        str: the normalised id (e.g. ``"phs000123"``), or ``""`` when the
        input is not a string or contains no phs number.
    """
    # Explicit checks replace the previous bare `except:`, which also hid
    # unrelated errors (and KeyboardInterrupt) behind an empty result.
    if not isinstance(input_str, str):
        return ""
    match = re.search(r"phs0*([0-9]+)", input_str, re.IGNORECASE)
    if match is None:
        return ""
    return "phs" + match.group(1).zfill(6)

def try_join(l):
    """Render a list as a comma-separated string; pass anything else through.

    Each element of a list is converted with ``str`` and the pieces are joined
    with ``", "``. Non-list inputs are returned unchanged, as is the original
    list if the conversion raises ``TypeError``.
    """
    if not isinstance(l, list):
        return l
    try:
        return ', '.join(str(element) for element in l)
    except TypeError:
        return l
    
def val_study_type_enum(l):
    """Flag a study type that is not one of the accepted values.

    Returns 1 when ``l`` is truthy and not in the accepted list, otherwise 0
    (so empty / ``None`` values are not flagged).
    """
    accepted = (
        "Observational",
        "Interventional",
        "Descriptive",
        "Analytical",
        "Prospective",
        "Retrospective",
        "Case report",
        "Case series",
        "Cross-sectional",
        "Cohort study",
    )
    if not l:
        return 0
    return 0 if l in accepted else 1

def val_nih_inst_center_sub_enum(l):
    """Flag an NIH institute/center value that is not one of the accepted codes.

    Returns 1 when ``l`` is truthy and not in the accepted list, otherwise 0
    (so empty / ``None`` values are not flagged).
    """
    accepted = (
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS",
        "NIBIB", "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS", "NIGMS",
        "NIMH", "NIMHD", "NINDS", "NINR", "NLM", "CC", "CIT", "CSR", "FIC",
        "NCATS", "NCCIH",
    )
    if not l:
        return 0
    return 0 if l in accepted else 1

def val_nih_ic_supp_study_enum(l):
    """Flag a list of NIH institute/center codes that contains an unaccepted entry.

    Returns 1 when ``l`` is a non-empty list with at least one item outside
    the accepted codes; returns 0 for a fully valid list, an empty list, or
    any non-list input.
    """
    accepted = (
        "NCI", "NEI", "NHLBI", "NHGRI", "NIA", "NIAAA", "NIAID", "NIAMS",
        "NIBIB", "NICHD", "NIDCD", "NIDCR", "NIDDK", "NIDA", "NIEHS", "NIGMS",
        "NIMH", "NIMHD", "NINDS", "NINR", "NLM", "CC", "CIT", "CSR", "FIC",
        "NCATS", "NCCIH",
    )
    if not l or not isinstance(l, list):
        return 0
    has_invalid_entry = any(code not in accepted for code in l)
    return 1 if has_invalid_entry else 0

def val_file_type_enum(l):
    """Flag a list of file types that contains an unaccepted entry.

    Returns 1 when ``l`` is a non-empty list with at least one item outside
    the accepted file types; returns 0 for a fully valid list, an empty list,
    or any non-list input.
    """
    accepted = ("Arrays", "Genome", "Exome", "Survey", "Phenotype")
    if not l or not isinstance(l, list):
        return 0
    has_invalid_entry = any(file_type not in accepted for file_type in l)
    return 1 if has_invalid_entry else 0

def fetch_dataset_details(snapshot_id, ds_consent_map):
    
    # Initialize variables
    terra_dict = {}
    dbgap_xml_dict = {}
    dbgap_study_api_dict = {}
    dbgap_fhir_dict = {}
    final_results_dict = {}
    
    # Retrieve snapshot details
    api_client = refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    attempt_counter = 0
    while attempt_counter <= 2:
        try:
            snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
            break
        except:
            sleep(5)
            attempt_counter += 1  
    snapshot_name = snapshot_details["name"]
    dataset_id = snapshot_details["source"][0]["dataset"]["id"]
    phs_id = format_phs_id(snapshot_details["source"][0]["dataset"]["phs_id"])
    if snapshot_details["source"][0]["dataset"]["secure_monitoring_enabled"] == True:
        access_management = "controlled"
    else:
        access_management = "open"
    if snapshot_details["source"][0]["dataset_properties"].get("source_workspaces"):  
        source_workspace = snapshot_details["source"][0]["dataset_properties"]["source_workspaces"][0]
    else:
        source_workspace = None
    if snapshot_details["duos_firecloud_group"] != None:
        duos_id = snapshot_details["duos_firecloud_group"]["duos_id"]
    else:
        duos_id = None
    print("\tSnapshot PHS_ID: " + str(phs_id))
    print("\tSource Workspace: " + str(source_workspace))
    print("\tCurrent DUOS ID: " + str(duos_id))
    
    # Pull information from existing DUOS registration (if listed)
    if duos_id:
        # Establish credentials
        creds, project = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)
        
        # Pull existing DUOS registration
        duos_dict = requests.get(
            url=f"https://consent.dsde-prod.broadinstitute.org/api/dataset/registration/{duos_id}",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()
        print(duos_dict)
    
    # Pull information from original workspace (if listed)
    if source_workspace:
        # Establish credentials
        creds, project = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)

        # Pull workspace attributes
        attempt_counter = 0
        while attempt_counter <= 2:
            try:
                ws_attributes = requests.get(
                    url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{source_workspace}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
                    headers={"Authorization": f"Bearer {creds.token}"}
                ).json()
                break
            except:
                sleep(5)
                attempt_counter += 1
        
        # Map to schema
        if ws_attributes.get("workspace"):
            terra_dict["studyName"] = coalesce(ws_attributes["workspace"]["attributes"].get("library:projectName"), source_workspace) 
            terra_dict["studyType"] = ws_attributes["workspace"]["attributes"].get("library:studyDesign")
            terra_dict["studyDescription"] = ws_attributes["workspace"]["attributes"].get("description")
            if ws_attributes["workspace"]["attributes"].get("library:dataCategory"):
                terra_dict["dataTypes"] = []
                for item in ws_attributes["workspace"]["attributes"]["library:dataCategory"]["items"]:
                    inner_list = item.split(",")
                    for inner_item in inner_list:
                        inner_item = inner_item.replace("'", "").strip()
                        terra_dict["dataTypes"].append(inner_item)
            terra_dict["phenotypeIndication"] = ws_attributes["workspace"]["attributes"].get("library:indication")
            terra_dict["species"] = "Homo sapiens"
            terra_dict["piName"] = ws_attributes["workspace"]["attributes"].get("library:datasetOwner")
            terra_dict["dataCustodianEmail"] = [ws_attributes["workspace"]["attributes"].get("library:contactEmail")]
            if ws_attributes["workspace"]["attributes"].get("tag:tags"):
                for tag in ws_attributes["workspace"]["attributes"].get("tag:tags")["items"]:
                    if "Consortium:" in tag:
                        terra_dict["consortium"] = tag.split(":")[1].strip()
                    elif "dbGaP:" in tag:
                        terra_dict["dbGaPPhsID"] = format_phs_id(tag.split(":")[1].strip())
                        if not phs_id:
                            phs_id = format_phs_id(tag.split(":")[1].strip()) 
            terra_dict["consentGroups.consentCode"] = ws_attributes["workspace"]["attributes"].get("library:dataUseRestriction")
            if ws_attributes["workspace"]["attributes"].get("library:datatype"):
                terra_dict["consentGroups.fileTypes.fileType"] = ws_attributes["workspace"]["attributes"]["library:datatype"]["items"]
            if ws_attributes["workspace"]["attributes"].get("library:numSubjects"):
                terra_dict["consentGroups.numberOfParticipants"] = ws_attributes["workspace"]["attributes"]["library:numSubjects"]
#         print("------------------------------------------------------")
#         print("terra_dict")
#         print(terra_dict)
        
    # Pull information from dbGaP (if phs_id listed)
#     print("PHS ID for dbGaP: " + phs_id)
    if phs_id:
        # Pull and parse XML
        phs_short = phs_id.replace("phs", "")
        dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
        attempt_counter = 0
        while attempt_counter <= 2:
            try:
                response = requests.get(url=dbgap_url)
                xml_data = xmltodict.parse(response.text)
                break
            except:
                sleep(5)
                attempt_counter += 1
        study_uid = ""

        # Map to schema
        if xml_data["dbgapss"].get("Study"):
            if isinstance(xml_data["dbgapss"]["Study"], list):
                study_data = xml_data["dbgapss"]["Study"][0]
            else:
                study_data = xml_data["dbgapss"]["Study"] 
            study_uid = study_data.get("@uid")
            dbgap_xml_dict["studyName"] = study_data["StudyInfo"].get("StudyNameEntrez")
            dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
            dbgap_xml_dict["dbGaPPhsID"] = phs_id
            dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
            if study_data["Authority"]["Persons"].get("Person"):
                for ap_entry in study_data["Authority"]["Persons"]["Person"]:
                    if ap_entry["Role"] == "PI":
                        dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                        dbgap_xml_dict["piEmail"] = ap_entry["@email"]
                        dbgap_xml_dict["piInstitution"] = ap_entry["Organization"]
                    elif ap_entry["Role"] == "PO" and ap_entry["Organization"] == "NIH":
                        dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                    elif ap_entry["Role"] == "GPA" and ap_entry["Organization"] == "NIH":
                        dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
            ic_list = []
            if isinstance(study_data["Authority"]["ICs"]["IC"], list):
                for ic_entry in study_data["Authority"]["ICs"]["IC"]:
                    ic_list.append(ic_entry["@name"])
            else:
                ic_list.append(study_data["Authority"]["ICs"]["IC"]["@name"])
            dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
            dbgap_xml_dict["consentGroups.numberOfParticipants"] = study_data.get("@num_participants")
            dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")
#             print("------------------------------------------------------")
#             print("dbgap_xml_dict")
#             print(dbgap_xml_dict)
        
        # Pull and parse Study API JSON
        if study_uid:
            dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
            attempt_counter = 0
            while attempt_counter <= 2:
                try:
                    response = requests.get(url=dbgap_study_url)
                    study_api_data = json.loads(response.text)
                    break
                except:
                    sleep(5)
                    attempt_counter += 1
            
            # Map to schema
            if study_api_data.get("error") == None:
                dbgap_study_api_dict["studyName"] = study_api_data["data"].get("report_name")
                dbgap_study_api_dict["studyDescription"] = study_api_data["data"].get("description")
                dbgap_study_api_dict["phenotypeIndication"] = study_api_data["data"].get("primary_disease")
                dbgap_study_api_dict["studyType"] = study_api_data["data"].get("study_design")
                dbgap_study_api_dict["dbGaPPhsID"] = phs_id
                dbgap_study_api_dict["dbGaPStudyRegistrationName"] = study_api_data["data"].get("report_name")
                for attr_entry in study_api_data["data"].get("attribution"):
                    if attr_entry.get("title") == "Principal Investigator":
                        dbgap_study_api_dict["piName"] = attr_entry.get("name")
                        dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
                        break
#             print("------------------------------------------------------")
#             print("dbgap_study_api_dict")
#             print(dbgap_study_api_dict)
        
        # Pull and parse FHIR API JSON
        dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
        attempt_counter = 0
        while attempt_counter <= 2:
            try:
                response = requests.get(url=dbgap_fhir_url)
                fhir_data = json.loads(response.text)
                break
            except:
                sleep(5)
                attempt_counter += 1

        # Map to schema
        if fhir_data.get("entry"):
            dbgap_fhir_dict["studyName"] = fhir_data["entry"][0]["resource"].get("title")
            dbgap_fhir_dict["studyDescription"] = fhir_data["entry"][0]["resource"].get("description")
            dbgap_fhir_dict["dbGaPPhsID"] = phs_id
            dbgap_fhir_dict["dbGaPStudyRegistrationName"] = fhir_data["entry"][0]["resource"].get("title")
            # NIH ICs
            if "Organization/" in fhir_data["entry"][0]["resource"]["sponsor"].get("reference"):
                dbgap_fhir_dict["nihICsSupportingStudy"] = [fhir_data["entry"][0]["resource"]["sponsor"].get("reference")[13:]]
            else:
                ic_display = fhir_data["entry"][0]["resource"]["sponsor"].get("display")
                if ic_display == "National Human Genome Research Institute":
                    dbgap_fhir_dict["nihICsSupportingStudy"] = ["NHGRI"]
                else:
                    dbgap_fhir_dict["nihICsSupportingStudy"] = [ic_display]
            # studyType
            if fhir_data["entry"][0]["resource"].get("category"):
                for cat_entry in fhir_data["entry"][0]["resource"].get("category"):
                    if cat_entry.get("coding"):
                        for coding_entry in cat_entry.get("coding"):
                            if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
                                value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                                if dbgap_fhir_dict.get("studyType") and value:
                                    dbgap_fhir_dict["studyType"] += f", {value}"
                                elif value:
                                    dbgap_fhir_dict["studyType"] = value
            # dataTypes
            dt_list = []
            if fhir_data["entry"][0]["resource"].get("extension"): 
                for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
                        for inner_ext_entry in ext_entry.get("extension"):
                            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                                for coding_entry in inner_ext_entry["valueCodeableConcept"].get("coding"):
                                    dt_list.append(coding_entry.get("code"))
            dbgap_fhir_dict["dataTypes"] = dt_list
            # phenotypeIndication
            if fhir_data["entry"][0]["resource"].get("focus"):
                for focus_entry in fhir_data["entry"][0]["resource"].get("focus"):
                    if focus_entry.get("coding"):
                        for coding_entry in focus_entry.get("coding"):
                            value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                            if dbgap_fhir_dict.get("phenotypeIndication") and value:
                                dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
                            elif value:
                                dbgap_fhir_dict["phenotypeIndication"] = value
            # numberOfParticipants
            if fhir_data["entry"][0]["resource"].get("extension"):
                for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
                        for inner_ext_entry in ext_entry.get("extension"):
                            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                                dbgap_fhir_dict["consentGroups.numberOfParticipants"] = inner_ext_entry["valueCount"].get("code")
#         print("------------------------------------------------------")
#         print("dbgap_fhir_dict")
#         print(dbgap_fhir_dict)
    
    # Reconcile information and create final results
    consent_code = coalesce(terra_dict.get("consentGroups.consentCode"), dbgap_fhir_dict.get("consentGroups.consentCode"), dbgap_xml_dict.get("consentGroups.consentCode"), dbgap_study_api_dict.get("consentGroups.consentCode"))
    if consent_code:
        consent_code = consent_code.upper().replace("_", "-")
    else:
        consent_code = ""
    consortium = coalesce(terra_dict.get("consortium"), dbgap_fhir_dict.get("consortium"), dbgap_xml_dict.get("consortium"), dbgap_study_api_dict.get("consortium"))
    dbGaPPhsID = coalesce(dbgap_fhir_dict.get("dbGaPPhsID"), dbgap_xml_dict.get("dbGaPPhsID"), dbgap_study_api_dict.get("dbGaPPhsID"), terra_dict.get("dbGaPPhsID"))
    studyName = coalesce(dbgap_fhir_dict.get("studyName"), dbgap_xml_dict.get("studyName"), dbgap_study_api_dict.get("studyName"), terra_dict.get("studyName"))
    if dbGaPPhsID and consent_code:
        study_consent = dbGaPPhsID + ":" + consent_code
        purl_doid = ds_consent_map.get(study_consent)
    else:
        purl_doid = None
    final_results_dict["snapshot_id"] = snapshot_id
    final_results_dict["duos_id"] = duos_id
    if duos_id:
        final_results_dict["studyName"] = duos_dict.get("studyName")
        final_results_dict["studyType"] = duos_dict.get("studyType")
        final_results_dict["studyDescription"] = duos_dict.get("studyDescription")
        final_results_dict["dataTypes"] = duos_dict.get("dataTypes")
        final_results_dict["phenotypeIndication"] = duos_dict.get("phenotypeIndication")
        final_results_dict["species"] = duos_dict.get("species")
        final_results_dict["piName"] = duos_dict.get("piName")
        final_results_dict["dataCustodianEmail"] = duos_dict.get("dataCustodianEmail")
        final_results_dict["publicVisibility"] = duos_dict.get("publicVisibility")
        final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if 'already' in duos_dict.get("nihAnvilUse").lower() else "I am NHGRI funded and I do not have a dbGaP PHS ID"
        final_results_dict["submittingToAnvil"] = duos_dict.get("submittingToAnvil")
        final_results_dict["dbGaPPhsID"] = duos_dict.get("dbGaPPhsID")
        final_results_dict["dbGaPStudyRegistrationName"] = duos_dict.get("dbGaPStudyRegistrationName")
        final_results_dict["embargoReleaseDate"] = duos_dict.get("embargoReleaseDate")
        final_results_dict["sequencingCenter"] = duos_dict.get("sequencingCenter")
        final_results_dict["piEmail"] = duos_dict.get("piEmail")
        final_results_dict["piInstitution"] = duos_dict.get("piInstitution")
        final_results_dict["nihGrantContractNumber"] = duos_dict.get("nihGrantContractNumber")
        final_results_dict["nihICsSupportingStudy"] = duos_dict.get("nihICsSupportingStudy")
        final_results_dict["nihProgramOfficerName"] = duos_dict.get("nihProgramOfficerName")
        final_results_dict["nihInstitutionCenterSubmission"] = duos_dict.get("nihInstitutionCenterSubmission")
        final_results_dict["nihInstitutionalCertificationFileName"] = duos_dict.get("nihInstitutionalCertificationFileName")
        final_results_dict["nihGenomicProgramAdministratorName"] = duos_dict.get("nihGenomicProgramAdministratorName")
        final_results_dict["multiCenterStudy"] = duos_dict.get("multiCenterStudy")
        final_results_dict["collaboratingSites"] = duos_dict.get("collaboratingSites")
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSR")
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = duos_dict.get("controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation")
        final_results_dict["alternativeDataSharingPlan"] = duos_dict.get("alternativeDataSharingPlan")
        final_results_dict["alternativeDataSharingPlanReasons"] = duos_dict.get("alternativeDataSharingPlanReasons")
        final_results_dict["alternativeDataSharingPlanExplanation"] = duos_dict.get("alternativeDataSharingPlanExplanation")
        final_results_dict["alternativeDataSharingPlanFileName"] = duos_dict.get("alternativeDataSharingPlanFileName")
        final_results_dict["alternativeDataSharingPlanDataSubmitted"] = duos_dict.get("alternativeDataSharingPlanDataSubmitted")
        final_results_dict["alternativeDataSharingPlanDataReleased"] = duos_dict.get("alternativeDataSharingPlanDataReleased")
        final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = duos_dict.get("alternativeDataSharingPlanTargetDeliveryDate")
        final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = duos_dict.get("alternativeDataSharingPlanTargetPublicReleaseDate")
        final_results_dict["alternativeDataSharingPlanAccessManagement"] = duos_dict.get("alternativeDataSharingPlanAccessManagement")
        final_results_dict["consentGroups.consentGroupName"] = duos_dict["consentGroups"][0].get("consentGroupName")
        final_results_dict["consentGroups.accessManagement"] = duos_dict["consentGroups"][0].get("accessManagement")
        final_results_dict["consentGroups.numberOfParticipants"] = duos_dict["consentGroups"][0].get("numberOfParticipants")
        final_results_dict["consentCode"] = consent_code
        final_results_dict["consentGroups.generalResearchUse"] = duos_dict["consentGroups"][0].get("generalResearchUse")
        final_results_dict["consentGroups.hmb"] = duos_dict["consentGroups"][0].get("hmb")
        final_results_dict["consentGroups.diseaseSpecificUse"] = duos_dict["consentGroups"][0].get("diseaseSpecificUse")
        final_results_dict["consentGroups.gs"] = duos_dict["consentGroups"][0].get("gs")
        final_results_dict["consentGroups.poa"] = duos_dict["consentGroups"][0].get("poa")
        final_results_dict["consentGroups.nmds"] = duos_dict["consentGroups"][0].get("nmds")
        final_results_dict["consentGroups.gso"] = duos_dict["consentGroups"][0].get("gso")
        final_results_dict["consentGroups.pub"] = duos_dict["consentGroups"][0].get("pub")
        final_results_dict["consentGroups.col"] = duos_dict["consentGroups"][0].get("col")
        final_results_dict["consentGroups.irb"] = duos_dict["consentGroups"][0].get("irb")
        final_results_dict["consentGroups.npu"] = duos_dict["consentGroups"][0].get("npu")
        final_results_dict["consentGroups.otherPrimary"] = duos_dict["consentGroups"][0].get("otherPrimary")
        final_results_dict["consentGroups.otherSecondary"] = duos_dict["consentGroups"][0].get("otherSecondary")
        final_results_dict["consentGroups.mor"] = duos_dict["consentGroups"][0].get("mor")
        final_results_dict["consentGroups.morDate"] = duos_dict["consentGroups"][0].get("morDate")
        final_results_dict["consentGroups.dataLocation"] = duos_dict["consentGroups"][0].get("dataLocation")
        final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
        if duos_dict["consentGroups"][0]["fileTypes"] and duos_dict["consentGroups"][0]["fileTypes"].get("fileType"):
            final_results_dict["consentGroups.fileTypes.fileType"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("fileType")
        else:
            final_results_dict["consentGroups.fileTypes.fileType"] = None
        if duos_dict["consentGroups"][0]["fileTypes"] and duos_dict["consentGroups"][0]["fileTypes"].get("functionalEquivalence"):
            final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = duos_dict["consentGroups"][0]["fileTypes"][0].get("functionalEquivalence")
        else:
            final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
        final_results_dict["consortium"] = consortium
    else:
        if dbGaPPhsID:
            final_results_dict["studyName"] = studyName + f" ({dbGaPPhsID})"
        else:
            final_results_dict["studyName"] = studyName
        final_results_dict["studyType"] = coalesce(dbgap_fhir_dict.get("studyType"), dbgap_xml_dict.get("studyType"), dbgap_study_api_dict.get("studyType"), terra_dict.get("studyType"))
        final_results_dict["studyDescription"] = format_description(coalesce(dbgap_fhir_dict.get("studyDescription"), dbgap_xml_dict.get("studyDescription"), dbgap_study_api_dict.get("studyDescription"), terra_dict.get("studyDescription")))
        if final_results_dict["studyDescription"]:
            final_results_dict["studyDescription"] = final_results_dict["studyDescription"] + "\nPlatform: AnVIL"
        else:
            final_results_dict["studyDescription"] = "Platform: AnVIL"
        final_results_dict["dataTypes"] = coalesce(terra_dict.get("dataTypes"), dbgap_fhir_dict.get("dataTypes"), dbgap_xml_dict.get("dataTypes"), dbgap_study_api_dict.get("dataTypes"))
        final_results_dict["phenotypeIndication"] = coalesce(terra_dict.get("phenotypeIndication"), dbgap_fhir_dict.get("phenotypeIndication"), dbgap_xml_dict.get("phenotypeIndication"), dbgap_study_api_dict.get("phenotypeIndication"))
        final_results_dict["species"] = "Human"
        final_results_dict["piName"] = coalesce(dbgap_fhir_dict.get("piName"), dbgap_xml_dict.get("piName"), dbgap_study_api_dict.get("piName"), terra_dict.get("piName"), "None")
        final_results_dict["dataCustodianEmail"] = ["help@lists.anvilproject.org"]
        final_results_dict["publicVisibility"] = True
        final_results_dict["nihAnvilUse"] = "I am NHGRI funded and I have a dbGaP PHS ID already" if dbGaPPhsID else "I am NHGRI funded and I do not have a dbGaP PHS ID"
        final_results_dict["submittingToAnvil"] = True
        final_results_dict["dbGaPPhsID"] = dbGaPPhsID
        final_results_dict["dbGaPStudyRegistrationName"] = coalesce(dbgap_fhir_dict.get("dbGaPStudyRegistrationName"), dbgap_xml_dict.get("dbGaPStudyRegistrationName"), dbgap_study_api_dict.get("dbGaPStudyRegistrationName"), terra_dict.get("dbGaPStudyRegistrationName"))
        final_results_dict["embargoReleaseDate"] = coalesce(dbgap_fhir_dict.get("embargoReleaseDate"), dbgap_xml_dict.get("embargoReleaseDate"), dbgap_study_api_dict.get("embargoReleaseDate"), terra_dict.get("embargoReleaseDate"))
        final_results_dict["sequencingCenter"] = None
        final_results_dict["piEmail"] = coalesce(dbgap_fhir_dict.get("piEmail"), dbgap_xml_dict.get("piEmail"), dbgap_study_api_dict.get("piEmail"), terra_dict.get("piEmail"))
        final_results_dict["piInstitution"] = coalesce(dbgap_fhir_dict.get("piInstitution"), dbgap_xml_dict.get("piInstitution"), dbgap_study_api_dict.get("piInstitution"), terra_dict.get("piInstitution"))
        final_results_dict["nihGrantContractNumber"] = None
        final_results_dict["nihICsSupportingStudy"] = coalesce(dbgap_fhir_dict.get("nihICsSupportingStudy"), dbgap_xml_dict.get("nihICsSupportingStudy"), dbgap_study_api_dict.get("nihICsSupportingStudy"), terra_dict.get("nihICsSupportingStudy"))
        final_results_dict["nihProgramOfficerName"] = coalesce(dbgap_fhir_dict.get("nihProgramOfficerName"), dbgap_xml_dict.get("nihProgramOfficerName"), dbgap_study_api_dict.get("nihProgramOfficerName"), terra_dict.get("nihProgramOfficerName"))
        final_results_dict["nihInstitutionCenterSubmission"] = "NHGRI"
        final_results_dict["nihInstitutionalCertificationFileName"] = None
        final_results_dict["nihGenomicProgramAdministratorName"] = coalesce(dbgap_fhir_dict.get("nihGenomicProgramAdministratorName"), dbgap_xml_dict.get("nihGenomicProgramAdministratorName"), dbgap_study_api_dict.get("nihGenomicProgramAdministratorName"), terra_dict.get("nihGenomicProgramAdministratorName"))
        final_results_dict["multiCenterStudy"] = None
        final_results_dict["collaboratingSites"] = [consortium] if consortium else []
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSR"] = None
        final_results_dict["controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation"] = None
        final_results_dict["alternativeDataSharingPlan"] = False
        final_results_dict["alternativeDataSharingPlanReasons"] = []
        final_results_dict["alternativeDataSharingPlanExplanation"] = None
        final_results_dict["alternativeDataSharingPlanFileName"] = None
        final_results_dict["alternativeDataSharingPlanDataSubmitted"] = None
        final_results_dict["alternativeDataSharingPlanDataReleased"] = None
        final_results_dict["alternativeDataSharingPlanTargetDeliveryDate"] = None
        final_results_dict["alternativeDataSharingPlanTargetPublicReleaseDate"] = None
        final_results_dict["alternativeDataSharingPlanAccessManagement"] = None
        final_results_dict["consentGroups.consentGroupName"] = snapshot_name
        final_results_dict["consentGroups.accessManagement"] = access_management
        final_results_dict["consentGroups.numberOfParticipants"] = coalesce(terra_dict.get("consentGroups.numberOfParticipants"), dbgap_fhir_dict.get("consentGroups.numberOfParticipants"), dbgap_xml_dict.get("consentGroups.numberOfParticipants"), dbgap_study_api_dict.get("consentGroups.numberOfParticipants"), "0")
        final_results_dict["consentCode"] = consent_code
        final_results_dict["consentGroups.generalResearchUse"] = True if access_management == "controlled" and "GRU" in consent_code else False
        final_results_dict["consentGroups.hmb"] = True if access_management == "controlled" and "HMB" in consent_code else False
        if purl_doid:
            final_results_dict["consentGroups.diseaseSpecificUse"] = [purl_doid]
        else:
            final_results_dict["consentGroups.diseaseSpecificUse"] = [consent_code] if "DS-" in consent_code else []
        final_results_dict["consentGroups.gs"] = consent_code if access_management == "controlled" and "GS-" in consent_code else None
        final_results_dict["consentGroups.poa"] = True if access_management == "controlled" and "POA" in consent_code else False
        final_results_dict["consentGroups.nmds"] = True if access_management == "controlled" and "NMDS" in consent_code else False
        final_results_dict["consentGroups.gso"] = True if access_management == "controlled" and "GSO" in consent_code else False
        final_results_dict["consentGroups.pub"] = True if access_management == "controlled" and "PUB" in consent_code else False 
        final_results_dict["consentGroups.col"] = True if access_management == "controlled" and "COL" in consent_code else False
        final_results_dict["consentGroups.irb"] = True if access_management == "controlled" and "IRB" in consent_code else False
        final_results_dict["consentGroups.npu"] = True if access_management == "controlled" and "NPU" in consent_code else False
        final_results_dict["consentGroups.otherPrimary"] = consent_code if (consent_code and access_management == "controlled" and not (final_results_dict["consentGroups.generalResearchUse"] or final_results_dict["consentGroups.hmb"] or final_results_dict["consentGroups.diseaseSpecificUse"] or final_results_dict["consentGroups.gs"] or final_results_dict["consentGroups.poa"] or final_results_dict["consentGroups.nmds"] or final_results_dict["consentGroups.gso"] or final_results_dict["consentGroups.pub"] or final_results_dict["consentGroups.col"] or final_results_dict["consentGroups.irb"] or final_results_dict["consentGroups.npu"])) else None
        final_results_dict["consentGroups.otherSecondary"] = None
        final_results_dict["consentGroups.mor"] = None
        final_results_dict["consentGroups.morDate"] = None
        final_results_dict["consentGroups.dataLocation"] = "TDR Location"
        final_results_dict["consentGroups.url"] = "https://data.terra.bio/snapshots/" + snapshot_id
        final_results_dict["consentGroups.fileTypes.fileType"] = coalesce(terra_dict.get("consentGroups.fileTypes.fileType"), dbgap_fhir_dict.get("consentGroups.fileTypes.fileType"), dbgap_xml_dict.get("consentGroups.fileTypes.fileType"), dbgap_study_api_dict.get("consentGroups.fileTypes.fileType"))
        final_results_dict["consentGroups.fileTypes.functionalEquivalence"] = None
        final_results_dict["consortium"] = consortium
    
    # Return results
    return final_results_dict


#############################################
## Input Parameters
#############################################

# Specify the snapshots to pull data for.
# Each entry is a snapshot id string; the execution loop below hands every id, one at a
# time, to fetch_dataset_details(). Add one id per line to process several snapshots in a run.
snapshot_id_list = [
    'e2736891-a569-449e-8cbf-b7d0274b64d0',
]

# Specify a mapping from phs-consent to DOID for DS consent codes (replace "_" with "-" in consent first).
# Key format:   "<phs id>:<consent code>", e.g. "phs000298:DS-ASD".
# Value format: the Disease Ontology term as an OBO PURL (http://purl.obolibrary.org/obo/DOID_<id>).
# The whole dict is passed to fetch_dataset_details() together with each snapshot id.
# NOTE(review): the lookup itself happens outside this cell - presumably a DS consent code
# without an entry here falls back to the raw consent code; confirm in fetch_dataset_details().
ds_consent_map = {
    'phs000298:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs000693:DS-BDIS': 'http://purl.obolibrary.org/obo/DOID_936',
    'phs000693:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs000744:DS-RD': 'http://purl.obolibrary.org/obo/DOID_15',
    'phs000744:DS-THAL-IRB': 'http://purl.obolibrary.org/obo/DOID_10241',
    'phs001222:DS-DRC-IRB-NPU': 'http://purl.obolibrary.org/obo/DOID_9351',
    'phs001227:DS-ATHSCL-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_1936',
    'phs001259:DS-CARD-MDS-GSO': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001487:DS-CVD-IRB-COL-MDS': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001489:DS-EAED-IRB-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EAED-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EARET-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EP-NPU': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPASM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPCOM-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-ADULT-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPI-MULTI-MDS': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBA-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBACID-NPU-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-EPSBAID-MDS-RD': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs001489:DS-NSD-ADULTS-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001489:DS-NSD-NPU-MDS': 'http://purl.obolibrary.org/obo/DOID_863',
    'phs001506:DS-CVD-IRB': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001592:DS-CVD': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs001642:DS-GI': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-GI-18+-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-GI-IRB-MDS': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-GI,18+': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-GID': 'http://purl.obolibrary.org/obo/DOID_77',
    'phs001642:DS-IBD': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001642:DS-IBD-MDS': 'http://purl.obolibrary.org/obo/DOID_0050589',
    'phs001676:DS-AONDD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001740:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001741:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001766:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs001871:DS-CAD-IRB-COL-NPU': 'http://purl.obolibrary.org/obo/DOID_3393',
    'phs001894:DS-EAC-PUB-GSO': 'http://purl.obolibrary.org/obo/DOID_1826',
    'phs002004:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002032:DS-SMA-MDS': 'http://purl.obolibrary.org/obo/DOID_12377',
    'phs002041:DS-MLHLTH-MDS': 'http://purl.obolibrary.org/obo/DOID_150',
    'phs002041:DS-SCZ-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002041:DS-SZRD-MDS': 'http://purl.obolibrary.org/obo/DOID_5419',
    'phs002042:DS-ASD-MDS-PUB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002043:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002044:DS-ASD-IRB': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002206:DS-PEDD-IRB': 'http://purl.obolibrary.org/obo/DOID_4',
    'phs002282:DS-CVDRF': 'http://purl.obolibrary.org/obo/DOID_1287',
    'phs002502:DS-ASD': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ASD-MDS': 'http://purl.obolibrary.org/obo/DOID_0060041',
    'phs002502:DS-ND': 'http://purl.obolibrary.org/obo/DOID_1289',
}

#############################################
## Execution
#############################################
# Collect one metadata record per requested snapshot, announcing each as it starts
dataset_details_records = []
for snapshot_id in snapshot_id_list:
    print(f"Processing snapshot_id: {snapshot_id}...")
    dataset_details_records.append(fetch_dataset_details(snapshot_id, ds_consent_map))

# Tabulate the records (one row each) and order them by study, then by consent group
output = pd.DataFrame(dataset_details_records)
output_sorted = output.sort_values(
    by=["studyName", "consentGroups.consentGroupName"],
    ascending=[True, True],
    ignore_index=True,
)

#############################################
## Validation and Output
#############################################
# --- Unique value validation -------------------------------------------------
# Work on a copy so the sorted output itself is never modified by the checks
output_unique_val = output_sorted.copy()

# List-valued study fields are flattened with try_join() so they can be counted as distinct values
list_fields = ["dataTypes", "dataCustodianEmail", "nihICsSupportingStudy", "collaboratingSites", "alternativeDataSharingPlanReasons"]
for list_field in list_fields:
    output_unique_val[list_field] = [try_join(cell) for cell in output_unique_val[list_field]]

# Study-level columns: everything that is neither a consent-group field nor an identifier column
study_level_col_list = [
    column
    for column in output_unique_val.columns
    if "consentGroups." not in column
    and column not in ["studyName", "snapshot_id", "consortium", "consentCode"]
]

# A study passes when none of its study-level fields holds more than one distinct value
df_unique = output_unique_val.groupby("studyName")[study_level_col_list].nunique()
df_unique["unique_value_validation"] = [
    "Pass" if distinct_count <= 1 else "Fail" for distinct_count in df_unique.max(axis=1)
]

# --- Enum validation ---------------------------------------------------------
# Separate copy: each enum column is overwritten in place with its validator's result
output_enum_val = output_sorted.copy()
enum_validators = {
    "studyType": val_study_type_enum,
    "nihInstitutionCenterSubmission": val_nih_inst_center_sub_enum,
    "nihICsSupportingStudy": val_nih_ic_supp_study_enum,
    "consentGroups.fileTypes.fileType": val_file_type_enum,
}
for enum_field, validator in enum_validators.items():
    output_enum_val[enum_field] = [validator(cell) for cell in output_enum_val[enum_field]]

# Study-level enums: sum validator results per study; any total of 1 or more marks the study as failed
study_enum_cols = ["studyType", "nihInstitutionCenterSubmission", "nihICsSupportingStudy"]
df_study_enum = output_enum_val.groupby("studyName")[study_enum_cols].sum()
df_study_enum["study_enum_value_validation"] = [
    "Pass" if flagged < 1 else "Fail" for flagged in df_study_enum.max(axis=1)
]

# Consent-group-level enums: same rule, grouped by consent group name instead of study
consent_group_enum_cols = ["consentGroups.fileTypes.fileType"]
df_consent_group_enum = output_enum_val.groupby("consentGroups.consentGroupName")[consent_group_enum_cols].sum()
df_consent_group_enum["consent_group_enum_value_validation"] = [
    "Pass" if flagged < 1 else "Fail" for flagged in df_consent_group_enum.max(axis=1)
]

# --- Attach the three verdict columns to the sorted output ---------------------
output_sorted_validated = (
    output_sorted
    .join(df_unique["unique_value_validation"], on="studyName", how="left")
    .join(df_study_enum["study_enum_value_validation"], on="studyName", how="left")
    .join(df_consent_group_enum["consent_group_enum_value_validation"], on="consentGroups.consentGroupName", how="left")
)

# --- Display -----------------------------------------------------------------
for _ in range(2):
    print("----------------------------------------------------------------------------------------------------")
print("Validated Metadata Output:")
display(output_sorted_validated.style.hide(axis="index"))

# Each detail table has its group key moved back into a regular column before it is shown
detail_sections = [
    ("Unique Study Value Validation Results:", df_unique),
    ("Study Enum Value Validation Results:", df_study_enum),
    ("Consent Group Enum Value Validation Results:", df_consent_group_enum),
]
for section_title, section_frame in detail_sections:
    print("\n")
    print(section_title)
    section_frame.reset_index(inplace=True)
    display(section_frame.style.hide(axis="index"))


Processing snapshot_id: e2736891-a569-449e-8cbf-b7d0274b64d0...
	Snapshot PHS_ID: phs001642
	Source Workspace: AnVIL_CCDG_Broad_AI_IBD_Kugathasan_WGS
	Current DUOS ID: DUOS-000198
{'studyId': 62, 'studyName': 'Center for Common Disease Genomics [CCDG] - Autoimmune: Inflammatory Bowel Disease (IBD) Exomes and Genomes (phs001642)', 'studyType': None, 'studyDescription': "The National Human Genome Research Institute (NHGRI) has funded a collaborative large-scale genome sequencing effort to comprehensively identify rare risk and protective variants contributing to multiple common disease phenotypes. Called the Centers for Common Disease Genomics (CCDG), this initiative will explore a range of diseases with the ultimate goal of: undertaking variant discovery for enough different examples of disease architectures and study designs to better understand the general principles of the genomic architecture underlying common, complex inherited diseases; understanding how best to design rare varian

snapshot_id,duos_id,studyName,studyType,studyDescription,dataTypes,phenotypeIndication,species,piName,dataCustodianEmail,publicVisibility,nihAnvilUse,submittingToAnvil,dbGaPPhsID,dbGaPStudyRegistrationName,embargoReleaseDate,sequencingCenter,piEmail,piInstitution,nihGrantContractNumber,nihICsSupportingStudy,nihProgramOfficerName,nihInstitutionCenterSubmission,nihInstitutionalCertificationFileName,nihGenomicProgramAdministratorName,multiCenterStudy,collaboratingSites,controlledAccessRequiredForGenomicSummaryResultsGSR,controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation,alternativeDataSharingPlan,alternativeDataSharingPlanReasons,alternativeDataSharingPlanExplanation,alternativeDataSharingPlanFileName,alternativeDataSharingPlanDataSubmitted,alternativeDataSharingPlanDataReleased,alternativeDataSharingPlanTargetDeliveryDate,alternativeDataSharingPlanTargetPublicReleaseDate,alternativeDataSharingPlanAccessManagement,consentGroups.consentGroupName,consentGroups.accessManagement,consentGroups.numberOfParticipants,consentCode,consentGroups.generalResearchUse,consentGroups.hmb,consentGroups.diseaseSpecificUse,consentGroups.gs,consentGroups.poa,consentGroups.nmds,consentGroups.gso,consentGroups.pub,consentGroups.col,consentGroups.irb,consentGroups.npu,consentGroups.otherPrimary,consentGroups.otherSecondary,consentGroups.mor,consentGroups.morDate,consentGroups.dataLocation,consentGroups.url,consentGroups.fileTypes.fileType,consentGroups.fileTypes.functionalEquivalence,consortium,unique_value_validation,study_enum_value_validation,consent_group_enum_value_validation
e2736891-a569-449e-8cbf-b7d0274b64d0,DUOS-000198,Center for Common Disease Genomics [CCDG] - Autoimmune: Inflammatory Bowel Disease (IBD) Exomes and Genomes (phs001642),,"The National Human Genome Research Institute (NHGRI) has funded a collaborative large-scale genome sequencing effort to comprehensively identify rare risk and protective variants contributing to multiple common disease phenotypes. Called the Centers for Common Disease Genomics (CCDG), this initiative will explore a range of diseases with the ultimate goal of: undertaking variant discovery for enough different examples of disease architectures and study designs to better understand the general principles of the genomic architecture underlying common, complex inherited diseases; understanding how best to design rare variant studies for common disease; developing resources, informatics tools, and innovative approaches and technologies for multiple disease research communities and the wider biomedical research community. The initial focus of the CCDGs will be in cardiovascular disease (early-onset coronary artery disease, atrial fibrillation, hemorrhagic stroke), neuropsychiatric disease (epilepsy, autism), and autoimmune/inflammatory disease (type 1 diabetes, inflammatory bowel disease). The Broad Institute is one of four selected CCDG project centers. The overarching aim of the Inflammatory Bowel Disease (IBD) program is to define the full allelic spectrum of protein-altering variation in genes associated to IBD, and assess their role in both Crohn's Disease (CD) and Ulcerative Colitis (UC) risk. The whole genome sequencing data generated here is comprised of samples from US-based diverse populations including African American, Puerto-Rican, Caribean and Cuban origins. 
Platform: AnVIL",['Raw Sequencing data'],inflammatory bowel disease,Human,"Mark Daly, PhD",['help@lists.anvilproject.org'],True,I am NHGRI funded and I have a dbGaP PHS ID already,True,phs001642,Center for Common Disease Genomics [CCDG] - Autoimmune: Inflammatory Bowel Disease (IBD) Exomes and Genomes (phs001642),,,,0,Unknown,['NHGRI'],"Felsenfeld, Adam",NHGRI,,"Wetterstrand, Kris",,['CCDG'],,,True,[],,,,,,,,ANVIL_CCDG_Broad_AI_IBD_Kugathasan_WGS_20240113_ANV5_202401141350,CONTROLLED,1348,GRU,,,[],,,,,,,,,,,,,TDR_LOCATION,https://data.terra.bio/snapshots/e2736891-a569-449e-8cbf-b7d0274b64d0,,,CCDG,Pass,Pass,Pass




Unique Study Value Validation Results:


studyName,duos_id,studyType,studyDescription,dataTypes,phenotypeIndication,species,piName,dataCustodianEmail,publicVisibility,nihAnvilUse,submittingToAnvil,dbGaPPhsID,dbGaPStudyRegistrationName,embargoReleaseDate,sequencingCenter,piEmail,piInstitution,nihGrantContractNumber,nihICsSupportingStudy,nihProgramOfficerName,nihInstitutionCenterSubmission,nihInstitutionalCertificationFileName,nihGenomicProgramAdministratorName,multiCenterStudy,collaboratingSites,controlledAccessRequiredForGenomicSummaryResultsGSR,controlledAccessRequiredForGenomicSummaryResultsGSRRequiredExplanation,alternativeDataSharingPlan,alternativeDataSharingPlanReasons,alternativeDataSharingPlanExplanation,alternativeDataSharingPlanFileName,alternativeDataSharingPlanDataSubmitted,alternativeDataSharingPlanDataReleased,alternativeDataSharingPlanTargetDeliveryDate,alternativeDataSharingPlanTargetPublicReleaseDate,alternativeDataSharingPlanAccessManagement,unique_value_validation
Center for Common Disease Genomics [CCDG] - Autoimmune: Inflammatory Bowel Disease (IBD) Exomes and Genomes (phs001642),1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,Pass




Study Enum Value Validation Results:


studyName,studyType,nihInstitutionCenterSubmission,nihICsSupportingStudy,study_enum_value_validation
Center for Common Disease Genomics [CCDG] - Autoimmune: Inflammatory Bowel Disease (IBD) Exomes and Genomes (phs001642),0,0,0,Pass




Consent Group Enum Value Validation Results:


consentGroups.consentGroupName,consentGroups.fileTypes.fileType,consent_group_enum_value_validation
ANVIL_CCDG_Broad_AI_IBD_Kugathasan_WGS_20240113_ANV5_202401141350,0,Pass


# Step 2: Load Reviewed Metadata into DUOS

In [None]:
#############################################
## Functions
#############################################

def format_list(input_list, min_items):
    """Coerce a value read from the input file into a list.

    Args:
        input_list: A list, a string, or any other value. Strings are parsed
            with ast.literal_eval, so the text "['a', 'b']" becomes
            ['a', 'b'].
        min_items: Number of "Unknown" placeholders to return when the input
            is empty. Only applied to empty input; a non-empty list is
            returned unchanged even if it is shorter than this.

    Returns:
        The list itself for a non-empty list; the parsed list for a string
        holding a list literal; a one-item list for a string holding a bare
        value; "Unknown" placeholders (or []) for empty input; [] for any
        other non-empty value.

    Raises:
        ValueError, SyntaxError: If a string starts with "[" but is not a
            valid list literal.
    """
    if isinstance(input_list, str):
        text = input_list.strip()
        if text:
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                # A hand-edited cell may hold a bare value (e.g. an email
                # address) instead of a list literal: keep it as a single
                # item. Text that was clearly meant to be a list still fails
                # loudly so malformed data is never passed on silently.
                if text.startswith("["):
                    raise
                return [text]
            # Re-run on the parsed value so the list / empty / other rules
            # below apply to it as well
            return format_list(parsed, min_items)
        # Whitespace-only text counts as empty input
        input_list = []

    if input_list:
        return input_list if isinstance(input_list, list) else []

    # Empty input: pad with placeholders when a minimum is required
    return ["Unknown"] * min_items if min_items > 0 else []
    
def format_file_types(ft_list, fe):
    """Build the list of file type entries for a consent group.

    Args:
        ft_list: File types, in any form accepted by format_list().
        fe: Functional equivalence value applied to every entry; "Unknown"
            is used when it is empty.

    Returns:
        One {"fileType", "functionalEquivalence"} dict per file type, or []
        when ft_list is empty.
    """
    if not ft_list:
        return []

    equivalence = fe if fe else "Unknown"
    return [
        {"fileType": file_type, "functionalEquivalence": equivalence}
        for file_type in format_list(ft_list, 0)
    ]
    
def upload_to_duos(input_file, token, env, dac_id):
    
    # Determine the target URL from the env variable
    if env == "prod":
        url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        url = "https://consent.dsde-dev.broadinstitute.org"
    
    # Pull down specified file from the cloud
    results_log = []
    print(f"Downloading input file {input_file}...")
    try:
        input_df = pd.read_csv(input_file_gcs_path, delimiter = "\t", encoding='unicode_escape')
        input_df = input_df.astype(object).where(pd.notnull(input_df),None)
        input_df.fillna("",inplace=True)
        input_dict = input_df.to_dict(orient="records")
        results_log.append(["Input File Download", "Succeeded", ""])
    except Exception as e:
        msg = f"Error downloading input file ({input_file}): {str(e)}"
        results_log.append(["Input File Download", "Failed", msg])
        print(msg)
        return results_log

    # Pull a list of existing datasets and studies from DUOS and build lookup dicts
    print("Building study and dataset lookup dicts from DUOS...")
    try:
        datasets = requests.get(
            url=f"{url}/api/dataset/v2?asCustodian=false",
            headers={"Authorization": f"Bearer {token}"}
        ).json()
        study_lookup = {}
        for dataset_entry in datasets:
            if dataset_entry["study"].get("name"):
                if not study_lookup.get(dataset_entry["study"]["name"]):
                    study_lookup[dataset_entry["study"]["name"]] = dataset_entry["study"]["studyId"]
        dataset_lookup = {}
        for dataset_entry in datasets:
            if dataset_entry.get("name"):
                dataset_lookup[dataset_entry["name"]] = dataset_entry["dataSetId"]
        results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Succeeded", ""])
    except Exception as e:
        msg = f"Error building study and dataset lookups: {str(e)}"
        results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Failed", msg])
        print(msg)
        return results_log
    
    # Parse and build DUOS schema for inputted file
    print("Parsing input file and formatting into DUOS schema...")
    try:
        # Determine data submitter id
        response = requests.get(
            url=f"{url}/api/user/me",
            headers={"Authorization": f"Bearer {token}"}
        ).json()
        data_submitter_id = response["userId"]
        # Build dictionary for upload
        upload_dict = {}
        for input_entry in input_dict:
            snapshot_id = input_entry["snapshot_id"]
            study_name = input_entry["studyName"]
            consent_group_name = input_entry["consentGroups.consentGroupName"]
            access_type = input_entry["consentGroups.accessManagement"]
            print(f"Parsing and formatting metadata for snapshot {snapshot_id} from the input file. Target study is: {study_name}")
            study_id = study_lookup.get(study_name)
            dataset_id = dataset_lookup.get(consent_group_name)
            if study_id and dataset_id:
                consent_group_dict = {
                            "consentGroupName": consent_group_name,
                            "datasetId": dataset_id,
                            "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                            "dataLocation": input_entry["consentGroups.dataLocation"],
                            "url": input_entry["consentGroups.url"],
                            "fileTypes": []
                            #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                    }
            elif access_type == "open":
                consent_group_dict = {
                            "consentGroupName": consent_group_name,
                            "accessManagement": access_type,
                            "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                            "dataLocation": input_entry["consentGroups.dataLocation"],
                            "url": input_entry["consentGroups.url"],
                            "fileTypes": []
                            #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                    }
            else:
                consent_group_dict = {
                            "consentGroupName": consent_group_name,
                            "dataAccessCommitteeId": dac_id,
                            "accessManagement": access_type,
                            "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                            "generalResearchUse": input_entry["consentGroups.generalResearchUse"],
                            "hmb": input_entry["consentGroups.hmb"],
                            "diseaseSpecificUse": format_list(input_entry["consentGroups.diseaseSpecificUse"], 0),
                            "gs": input_entry["consentGroups.gs"],
                            "poa": input_entry["consentGroups.poa"],
                            "nmds": input_entry["consentGroups.nmds"],
                            "gso": input_entry["consentGroups.gso"],
                            "pub": input_entry["consentGroups.pub"],
                            "col": input_entry["consentGroups.col"],
                            "irb": input_entry["consentGroups.irb"],
                            "npu": input_entry["consentGroups.npu"],
                            #"otherPrimary": input_entry["consentGroups.otherPrimary"], --> Excluding for now, per JL's request
                            #"otherSecondary": input_entry["consentGroups.otherSecondary"], --> Excluding for now, per JL's request
                            #"mor": input_entry["consentGroups.mor"], --> Date formatting validation for morDate, exclude for now
                            #"morDate": input_entry["consentGroups.morDate"], --> Date formatting validation, exclude for now
                            "dataLocation": input_entry["consentGroups.dataLocation"],
                            "url": input_entry["consentGroups.url"],
                            "fileTypes": []
                            #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
                    }
            study_dict = {}
            consent_group_list = []
            if study_name not in upload_dict.keys():
                consent_group_list.append(consent_group_dict)
                study_dict = {
                    "studyName": study_name,
                    #"studyType": input_entry["studyType"], --> Enumeration, exclude for now
                    "studyDescription": input_entry["studyDescription"],
                    "dataTypes": format_list(input_entry["dataTypes"], 1),
                    "phenotypeIndication": input_entry["phenotypeIndication"],
                    "species": input_entry["species"],
                    "piName": input_entry["piName"] if input_entry["piName"] else "NA",
                    "dataSubmitterUserId": data_submitter_id,
                    "dataCustodianEmail": format_list(input_entry["dataCustodianEmail"], 0),
                    "publicVisibility": input_entry["publicVisibility"],
                    "nihAnvilUse": input_entry["nihAnvilUse"],
                    "submittingToAnvil": input_entry["submittingToAnvil"],
                    "dbGaPPhsID": input_entry["dbGaPPhsID"],
                    "dbGaPStudyRegistrationName": input_entry["studyName"],
                    #"embargoReleaseDate": input_entry["embargoReleaseDate"], --> Date formatting validation, exclude for now
                    "sequencingCenter": input_entry["sequencingCenter"],
                    "piEmail": input_entry["piEmail"],
                    #"piInstitution": input_entry["piInstitution"], --> Integer ID for registered institutions, exclude for now
                    "piInstitution": 0,
                    "nihGrantContractNumber": "Unknown", # Required currently
                    "nihICsSupportingStudy": format_list(input_entry["nihICsSupportingStudy"], 0),
                    "nihProgramOfficerName": input_entry["nihProgramOfficerName"],
                    "nihInstitutionCenterSubmission": input_entry["nihInstitutionCenterSubmission"],
                    "nihInstitutionalCertificationFileName": input_entry["nihInstitutionalCertificationFileName"],
                    "nihGenomicProgramAdministratorName": input_entry["nihGenomicProgramAdministratorName"],
                    "collaboratingSites": format_list(input_entry["collaboratingSites"], 0),
                    "alternativeDataSharingPlan": input_entry["alternativeDataSharingPlan"],
                    "consentGroups": consent_group_list
                }
                upload_dict[study_name] = study_dict
            else:
                for consent_group in upload_dict[study_name]["consentGroups"]:
                    if consent_group["consentGroupName"] != consent_group_dict["consentGroupName"]:
                        consent_group_list.append(consent_group)
                consent_group_list.append(consent_group_dict)
                study_dict = {
                    "studyName": study_name,
                    #"studyType": upload_dict[study_name]["studyType"], --> Enumeration, exclude for now
                    "studyDescription": upload_dict[study_name]["studyDescription"],
                    "dataTypes": upload_dict[study_name]["dataTypes"],
                    "phenotypeIndication": upload_dict[study_name]["phenotypeIndication"],
                    "species": upload_dict[study_name]["species"],
                    "piName": upload_dict[study_name]["piName"] if upload_dict[study_name]["piName"] else "NA",
                    "dataSubmitterUserId": upload_dict[study_name]["dataSubmitterUserId"],
                    "dataCustodianEmail": upload_dict[study_name]["dataCustodianEmail"],
                    "publicVisibility": upload_dict[study_name]["publicVisibility"],
                    "nihAnvilUse": upload_dict[study_name]["nihAnvilUse"],
                    "submittingToAnvil": upload_dict[study_name]["submittingToAnvil"],
                    "dbGaPPhsID": upload_dict[study_name]["dbGaPPhsID"],
                    "dbGaPStudyRegistrationName": upload_dict[study_name]["studyName"],
                    #"embargoReleaseDate": upload_dict[study_name]["embargoReleaseDate"], --> Date formatting validation, exclude for now
                    "sequencingCenter": upload_dict[study_name]["sequencingCenter"],
                    "piEmail": upload_dict[study_name]["piEmail"],
                    #"piInstitution": upload_dict[study_name]["piInstitution"], --> Integer ID for registered institutions, exclude for now
                    "piInstitution": upload_dict[study_name]["piInstitution"],
                    "nihGrantContractNumber": upload_dict[study_name]["nihGrantContractNumber"],
                    "nihICsSupportingStudy": upload_dict[study_name]["nihICsSupportingStudy"],
                    "nihProgramOfficerName": upload_dict[study_name]["nihProgramOfficerName"],
                    "nihInstitutionCenterSubmission": upload_dict[study_name]["nihInstitutionCenterSubmission"],
                    "nihInstitutionalCertificationFileName": upload_dict[study_name]["nihInstitutionalCertificationFileName"],
                    "nihGenomicProgramAdministratorName": upload_dict[study_name]["nihGenomicProgramAdministratorName"],
                    "collaboratingSites": upload_dict[study_name]["collaboratingSites"],
                    "alternativeDataSharingPlan": upload_dict[study_name]["alternativeDataSharingPlan"],
                    "consentGroups": consent_group_list
                }
                upload_dict[study_name] = study_dict
        results_log.append(["Input File Formatting", "Succeeded", ""])
    except Exception as e:
        msg = f"Error parsing and formatting input file: {str(e)}"
        results_log.append(["Input File Formatting", "Failed", msg])
        print(msg)
        return results_log
    
    # Loop through studies and dataset to upload
    for study in upload_dict.keys():
        print(f"Uploading data for study {study} into DUOS")
        # For studies that don't exist in DUOS, create a new study
        if not study_lookup.get(study):
            print("Study does NOT currently exist in DUOS. Creating new study and dataset records...")
            try:
                new_study_response = requests.post(
                    url=f"{url}/api/dataset/v3",
                    headers={"Authorization": f"Bearer {token}"},
                    files = {
                        "dataset": json.dumps(upload_dict[study]),
                        "alternativeDataSharingPlan": "",
                        "consentGroups[0].nihInstitutionalCertificationFile": ""  
                    }
                ).json()
                if new_study_response.get("studyId"):
                    study_id = new_study_response["studyId"]
                    msg = f"Study registration succeeded! Study Id: {study_id}"
                    results_log.append([f"New Study Registration - {study}", "Succeeded", msg])
                    print(msg)
                else:
                    err_msg = new_study_response["message"]
                    msg = f"Study registration failed: {err_msg}"
                    results_log.append([f"New Study Registration - {study}", "Failed", msg])
                    print(msg)
            except Exception as e:
                msg = f"Study registration failed: {str(e)}"
                results_log.append([f"New Study Registration - {study}", "Failed", msg])
                print(msg)
                
        # For studies that already exist in DUOS, lookup the study ID and update the existing study
        else:
            print("Study currently exists in DUOS. Updating study and dataset records...")
            pass
            try:
                # Update study in DUOS
                study_id = study_lookup.get(study)
                update_study_response = requests.put(
                    url=f"{url}/api/dataset/study/{study_id}",
                    headers={"Authorization": f"Bearer {token}"},
                    files = {
                        "dataset": json.dumps(upload_dict[study]),
                        "alternativeDataSharingPlan": "",
                        "consentGroups[0].nihInstitutionalCertificationFile": ""  
                    }
                ).json()   
                if update_study_response.get("studyId"):
                    study_id = update_study_response["studyId"]
                    msg = f"Study registration succeeded! Study Id: {study_id}"
                    results_log.append([f"New Study Registration - {study}", "Succeeded", msg])
                    print(msg)
                else:
                    err_msg = update_study_response["message"]
                    msg = f"Study registration failed: {err_msg}"
                    results_log.append([f"New Study Registration - {study}", "Failed", msg])
                    print(msg)
            except Exception as e:
                msg = f"Study registration failed: {str(e)}"
                results_log.append([f"Study Registration Update - {study}", "Failed", msg])
                print(msg)
    
    # Return results
    return results_log


#############################################
## Input Parameters
#############################################

# Cloud path to file to process
input_file_gcs_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/anvil_dataset_metadata_20240118.txt"

# User token (use gcloud auth print-access-token to get this).
# The token is prompted for at run time so that a live credential is never
# saved into (and shared with) the notebook.
from getpass import getpass
token = getpass("DUOS access token: ").strip()

# Environment
env = "dev"

# Target DAC identifier
dac_id = 2

#############################################
## Execution
#############################################

# Run the upload and show the returned results log as a table
upload_results = upload_to_duos(input_file_gcs_path, token, env, dac_id)
df_results = pd.DataFrame(upload_results, columns=["Item", "Status", "Message"])
print("\nUpload Results:")
display(df_results)


## Testing

In [None]:
input_file = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/dataset_metadata/dataset_metadata_1.txt"
# User token (use gcloud auth print-access-token to get this).
# The token is prompted for at run time so that a live credential is never
# saved into (and shared with) the notebook.
from getpass import getpass
token = getpass("DUOS access token: ").strip()

# Pull down specified file from the cloud
results_log = []
print(f"Downloading input file {input_file}...")
try:
    # Read the file this cell names in input_file. The earlier version read
    # input_file_gcs_path, a variable set by a different cell, so the file that
    # was parsed was not the one reported in the messages here.
    input_df = pd.read_csv(input_file, delimiter="\t", encoding='unicode_escape')
    # Normalise missing values to empty strings so downstream JSON has no NaN
    input_df = input_df.astype(object).where(pd.notnull(input_df), None)
    input_df.fillna("", inplace=True)
    input_dict = input_df.to_dict(orient="records")
    results_log.append(["Input File Download", "Succeeded", ""])
except Exception as e:
    msg = f"Error downloading input file ({input_file}): {str(e)}"
    results_log.append(["Input File Download", "Failed", msg])
    print(msg)
#     return results_log

# Fetch the datasets registered in DUOS and derive two name -> ID lookup tables
# (one keyed by study name, one keyed by dataset name)
print("Building study and dataset lookup dicts from DUOS...")
try:
    datasets = requests.get(
        url="https://consent.dsde-dev.broadinstitute.org/api/dataset/v2?asCustodian=false",
        headers={"Authorization": f"Bearer {token}"},
    ).json()
    study_lookup, dataset_lookup = {}, {}
    for dataset_entry in datasets:
        # Record a study ID only while no truthy ID is stored for that study name
        if dataset_entry["study"].get("name") and not study_lookup.get(dataset_entry["study"]["name"]):
            study_lookup[dataset_entry["study"]["name"]] = dataset_entry["study"]["studyId"]
        # Dataset names map straight to their DUOS dataset ID (later entries win)
        if dataset_entry.get("name"):
            dataset_lookup[dataset_entry["name"]] = dataset_entry["dataSetId"]
    results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Succeeded", ""])
except Exception as e:
    msg = f"Error building study and dataset lookups: {str(e)}"
    results_log.append(["DUOS Study and Dataset Lookup Dict Creation", "Failed", msg])
    print(msg)
#     return results_log

# Convert the rows of the input file into one DUOS study payload per study name
print("Parsing input file and formatting into DUOS schema...")
try:
    # The submitter is whoever owns the token being used
    response = requests.get(
        url="https://consent.dsde-dev.broadinstitute.org/api/user/me",
        headers={"Authorization": f"Bearer {token}"},
    ).json()
    data_submitter_id = response["userId"]
    # Payloads keyed by study name; each row contributes one consent group
    upload_dict = {}
    for input_entry in input_dict:
        snapshot_id = input_entry["snapshot_id"]
        study_name = input_entry["studyName"]
        consent_group_name = input_entry["consentGroups.consentGroupName"]
        print(f"Parsing and formatting metadata for snapshot {snapshot_id} from the input file. Target study is: {study_name}")
        study_id = study_lookup.get(study_name)
        dataset_id = dataset_lookup.get(consent_group_name)
        if not (study_id and dataset_id):
            # Study and/or dataset not found in the lookups: send the full consent group definition
            consent_group_dict = {
                "consentGroupName": consent_group_name,
                "dataAccessCommitteeId": 3,
                "accessManagement": input_entry["consentGroups.accessManagement"],
                "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                "generalResearchUse": input_entry["consentGroups.generalResearchUse"],
                "hmb": input_entry["consentGroups.hmb"],
                "diseaseSpecificUse": format_list(input_entry["consentGroups.diseaseSpecificUse"], 0),
                "gs": input_entry["consentGroups.gs"],
                "poa": input_entry["consentGroups.poa"],
                "nmds": input_entry["consentGroups.nmds"],
                "gso": input_entry["consentGroups.gso"],
                "pub": input_entry["consentGroups.pub"],
                "col": input_entry["consentGroups.col"],
                "irb": input_entry["consentGroups.irb"],
                "npu": input_entry["consentGroups.npu"],
                "otherPrimary": input_entry["consentGroups.otherPrimary"],
                "otherSecondary": input_entry["consentGroups.otherSecondary"],
                #"mor": input_entry["consentGroups.mor"], --> Date formatting validation for morDate, exclude for now
                #"morDate": input_entry["consentGroups.morDate"], --> Date formatting validation, exclude for now
                "dataLocation": input_entry["consentGroups.dataLocation"],
                "url": input_entry["consentGroups.url"],
                "fileTypes": []
                #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
            }
        else:
            # Both study and dataset are already in the lookups: reference the dataset by ID
            consent_group_dict = {
                "consentGroupName": consent_group_name,
                "datasetId": dataset_id,
                "numberOfParticipants": input_entry["consentGroups.numberOfParticipants"],
                "dataLocation": input_entry["consentGroups.dataLocation"],
                "url": input_entry["consentGroups.url"],
                "fileTypes": []
                #"fileTypes": format_file_types(input_entry["consentGroups.fileTypes.fileType"], input_entry["consentGroups.fileTypes.functionalEquivalence"]) --> Enumeration, exclude for now
            }
        if study_name in upload_dict:
            # Study already started by an earlier row: keep its other consent groups,
            # replace any same-named one with this row's version, and leave the
            # study-level fields exactly as first recorded
            consent_group_list = [
                consent_group
                for consent_group in upload_dict[study_name]["consentGroups"]
                if consent_group["consentGroupName"] != consent_group_dict["consentGroupName"]
            ]
            consent_group_list.append(consent_group_dict)
            study_dict = {**upload_dict[study_name], "consentGroups": consent_group_list}
        else:
            # First row for this study: take the study-level fields from this row
            consent_group_list = [consent_group_dict]
            study_dict = {
                "studyName": study_name,
                #"studyType": input_entry["studyType"], --> Enumeration, exclude for now
                "studyDescription": input_entry["studyDescription"],
                "dataTypes": format_list(input_entry["dataTypes"], 1),
                "phenotypeIndication": input_entry["phenotypeIndication"],
                "species": input_entry["species"],
                "piName": input_entry["piName"],
                "dataSubmitterUserId": data_submitter_id,
                "dataCustodianEmail": format_list(input_entry["dataCustodianEmail"], 0),
                "publicVisibility": input_entry["publicVisibility"],
                "nihAnvilUse": input_entry["nihAnvilUse"],
                "submittingToAnvil": input_entry["submittingToAnvil"],
                "dbGaPPhsID": input_entry["dbGaPPhsID"],
                "dbGaPStudyRegistrationName": input_entry["studyName"],
                #"embargoReleaseDate": input_entry["embargoReleaseDate"], --> Date formatting validation, exclude for now
                "sequencingCenter": input_entry["sequencingCenter"],
                "piEmail": input_entry["piEmail"],
                #"piInstitution": input_entry["piInstitution"], --> Integer ID for registered institutions, exclude for now
                "piInstitution": 0,
                "nihGrantContractNumber": "Unknown", # Required currently
                "nihICsSupportingStudy": format_list(input_entry["nihICsSupportingStudy"], 0),
                "nihProgramOfficerName": input_entry["nihProgramOfficerName"],
                "nihInstitutionCenterSubmission": input_entry["nihInstitutionCenterSubmission"],
                "nihInstitutionalCertificationFileName": input_entry["nihInstitutionalCertificationFileName"],
                "nihGenomicProgramAdministratorName": input_entry["nihGenomicProgramAdministratorName"],
                "collaboratingSites": format_list(input_entry["collaboratingSites"], 0),
                "alternativeDataSharingPlan": input_entry["alternativeDataSharingPlan"],
                "consentGroups": consent_group_list
            }
        upload_dict[study_name] = study_dict
    results_log.append(["Input File Formatting", "Succeeded", ""])
except Exception as e:
    msg = f"Error parsing and formatting input file: {str(e)}"
    results_log.append(["Input File Formatting", "Failed", msg])
    print(msg)
#     return results_log

# Loop through studies and dataset to upload.
# Each study is sent to DUOS as a multipart form: the study JSON in the "dataset"
# part plus two empty file parts. Outcomes are appended to results_log.
for study in upload_dict:
    print(f"Uploading data for study {study} into DUOS")
    # For studies that don't exist in DUOS, create a new study
    if not study_lookup.get(study):
        print("Study does NOT currently exist in DUOS. Creating new study and dataset records...")
        try:
            new_study_response = requests.post(
                url="https://consent.dsde-dev.broadinstitute.org/api/dataset/v3",
                headers={"Authorization": f"Bearer {token}"},
                files={
                    "dataset": json.dumps(upload_dict[study]),
                    "alternativeDataSharingPlan": "",
                    "consentGroups[0].nihInstitutionalCertificationFile": ""
                }
            ).json()
            if new_study_response.get("studyId"):
                study_id = new_study_response["studyId"]
                msg = f"Study registration succeeded! Study Id: {study_id}"
                results_log.append([f"New Study Registration - {study}", "Succeeded", msg])
                print(msg)
            else:
                # Fall back to the whole response body when there is no "message" field,
                # rather than failing with an uninformative KeyError
                err_msg = new_study_response.get("message", new_study_response)
                msg = f"Study registration failed: {err_msg}"
                results_log.append([f"New Study Registration - {study}", "Failed", msg])
                print(msg)
        except Exception as e:
            msg = f"Study registration failed: {str(e)}"
            results_log.append([f"New Study Registration - {study}", "Failed", msg])
            print(msg)

    # For studies that already exist in DUOS, lookup the study ID and update the existing study
    else:
        print("Study currently exists in DUOS. Updating study and dataset records...")
        try:
            # Update study in DUOS
            study_id = study_lookup.get(study)
            update_study_response = requests.put(
                url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/study/{study_id}",
                headers={"Authorization": f"Bearer {token}"},
                files={
                    "dataset": json.dumps(upload_dict[study]),
                    "alternativeDataSharingPlan": "",
                    "consentGroups[0].nihInstitutionalCertificationFile": ""
                }
            ).json()
            # Updates are logged under their own label so they can be told apart
            # from new registrations in the results table
            if update_study_response.get("studyId"):
                study_id = update_study_response["studyId"]
                msg = f"Study update succeeded! Study Id: {study_id}"
                results_log.append([f"Study Registration Update - {study}", "Succeeded", msg])
                print(msg)
            else:
                err_msg = update_study_response.get("message", update_study_response)
                msg = f"Study update failed: {err_msg}"
                results_log.append([f"Study Registration Update - {study}", "Failed", msg])
                print(msg)
        except Exception as e:
            msg = f"Study update failed: {str(e)}"
            results_log.append([f"Study Registration Update - {study}", "Failed", msg])
            print(msg)

In [None]:
# Display the raw JSON response from the most recent study update request
update_study_response

# Step 3: Attach DUOS IDs to Snapshots

## Add DUOS IDs Based on Snapshot Listed in DUOS

In [None]:
#############################################
## Functions
#############################################

def link_duos_ids_to_snapshots(snapshot_id_list, env, token):
    """Link TDR snapshots to the DUOS datasets that point at them.

    The DUOS dataset listing is scanned for entries whose "Data Location"
    property is "TDR Location"; the snapshot UUID is extracted from each such
    entry's "URL" property and mapped to the entry's DUOS identifier. Every
    snapshot in ``snapshot_id_list`` found in that mapping is then linked
    through the TDR snapshots API, with up to three attempts per snapshot.

    Args:
        snapshot_id_list: Snapshot UUID strings to link.
        env: "prod" targets the production DUOS host; any other value targets dev.
        token: Bearer token used for the DUOS dataset listing request.

    Returns:
        A list of ``[item, status, message]`` rows. If the lookup cannot be
        built, the list holds the single failure row and no linking is attempted.
    """
    results_log = []
    max_attempts = 3
    retry_delay_seconds = 5

    # Determine the target URL from the env variable
    if env == "prod":
        url = "https://consent.dsde-prod.broadinstitute.org"
    else:
        url = "https://consent.dsde-dev.broadinstitute.org"

    # Pull a list of existing datasets and studies from DUOS and build lookup dicts
    print("Building lookup between Snapshot and DUOS ID...")
    try:
        datasets = requests.get(
            url=f"{url}/api/dataset/v2?asCustodian=false",
            headers={"Authorization": f"Bearer {token}"}
        ).json()
        uuid_pattern = re.compile(
            r"([a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})",
            re.IGNORECASE,
        )
        snapshot_lookup = {}
        for dataset_entry in datasets:
            # Kept separate from `url` (the DUOS host) so the two are never confused
            dataset_url = ""
            is_snapshot = False
            for prop_entry in dataset_entry["properties"]:
                if prop_entry["propertyName"] == "URL":
                    dataset_url = prop_entry["propertyValue"]
                elif prop_entry["propertyName"] == "Data Location" and prop_entry["propertyValue"] == "TDR Location":
                    is_snapshot = True
            if not is_snapshot:
                continue
            id_match = uuid_pattern.search(str(dataset_url or ""))
            if id_match is None:
                # One dataset with a missing or malformed URL should not prevent
                # every other snapshot from being linked
                print(f"\tSkipping DUOS dataset with no snapshot ID in its URL: {dataset_url!r}")
                continue
            snapshot_lookup[id_match.group(1)] = dataset_entry["datasetIdentifier"]
        results_log.append(["Snapshot Lookup Creation", "Succeeded", ""])
    except Exception as e:
        msg = f"Error building lookup between Snapshot and DUOS ID: {str(e)}"
        results_log.append(["Snapshot Lookup Creation", "Failed", msg])
        print(msg)
        return results_log

    # Loop through input snapshots and link DUOS IDs to them
    print("Linking DUOS IDs to Snapshots...")
    api_client = refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    for snapshot_id in snapshot_id_list:
        print(f"\tProcessing snapshot ID = {snapshot_id}")
        duos_id = snapshot_lookup.get(snapshot_id)
        if not duos_id:
            results_log.append([f"DUOS ID to Snapshot Linkage ({snapshot_id})", "Failed", "No DUOS ID found for the snapshot."])
            continue
        item = f"DUOS ID to Snapshot Linkage ({snapshot_id} - {duos_id})"
        msg = ""
        # Every failed attempt consumes one try, whether it raised or merely came
        # back unlinked, so the loop always terminates
        for attempt in range(1, max_attempts + 1):
            try:
                response = snapshots_api.link_duos_dataset_to_snapshot(id=snapshot_id, duos_id=duos_id).to_dict()
                if response.get("linked"):
                    results_log.append([item, "Success", ""])
                    break
                response_message = response.get("message") or f"unexpected response: {response}"
                msg = f"Error linking DUOS ID to Snapshot: {response_message}"
            except Exception as e:
                msg = f"Error linking DUOS ID to Snapshot: {str(e)}"
            if attempt < max_attempts:
                sleep(retry_delay_seconds)
        else:
            # No attempt succeeded: report the last error seen
            results_log.append([item, "Failed", msg])
    return results_log

#############################################
## Input Parameters
#############################################

# User token (use gcloud auth print-access-token to get this).
# The token is prompted for at run time so that a live credential is never
# saved into (and shared with) the notebook.
from getpass import getpass
token = getpass("DUOS access token: ").strip()

# Environment
env = "prod"

# Snapshot list: snapshot ID strings passed to link_duos_ids_to_snapshots below
snapshot_id_list = [
    '737d454c-88be-477f-ae2c-ef473e2106ce',
    '253e2b36-1674-482b-bfbd-4e0b05cdfe63',
    '3f53e841-ca9d-4b55-b390-590718533561',
    '01cf2450-604b-43e5-9f4e-9ec4e0bf0a61',
    '85b0b351-cd0a-4efe-95a4-e39273c42831',
    'c9037419-367e-439c-a247-b0dae7c24146',
    'd7b2b2c6-72fd-4084-af34-a86edfe3ac47',
    'd63a63ce-24c8-413a-89c0-4bd4c82370c0',
    '1bb208f2-ecf3-4589-a9bd-b6e94178584d',
    '5773565d-ad7c-4f51-8b4f-f1ee5dffc08a',
    '2e5c5fe3-3af4-4c34-a85e-af6b4135f089',
    '27068295-b3c0-4260-9447-9ca96814d46f',
    '060c707a-2f0d-4730-bbd6-d25489abfcf6',
    '7e59197f-b859-4279-add3-de24bbc7e52b',
    '624fef99-e4ce-4c12-a3d9-90995b5da970',
    'a68d3145-81c2-41f8-9944-5e4a5058934a',
    'a3b18d45-96c2-4526-8fde-65ab3265868f',
    '3ec72891-87d2-431f-850c-e52013330ea8',
    '87d02347-d169-4ce0-9027-3c8e11e48c40',
    '61b6ae23-ca19-4d31-bad3-2281a8528886',
    '7c4edc65-bfe6-4ede-a68a-c0b9d2564f29',
    'f330517e-46fd-4de3-8063-015b524a7324',
    'f0d8bb27-1695-4faf-8b27-4b95260b8f17',
    '17d14df1-cb64-4aae-8049-c1728a3c0c81',
    '434f85e2-4435-483c-8099-b03c8ba794ed',
    '5bba97dc-d6ab-4329-912f-148c8b807056',
    '4c722626-c559-4f5a-84bd-8d7d46983e1e',
    '6df525e1-b143-4e6f-b667-80c783ae1b66',
    '079eb53c-e2b6-4da6-ab5f-fc2136a3ecc1',
    '1a26532c-16e6-4f1c-81f9-8f07a8181421',
    '3ac713b5-3645-4381-ac66-ecbc281a2ab8',
    '4911bd18-5db9-418a-9dc0-0ea28ae937d6',
    'bbd04481-0b9d-4c21-ba65-a43638116e0f',
    '2b78a3ac-8bca-4938-bc7c-26a60f9c04ac',
    '4bb891fc-fcae-40cc-bf59-73716de7e04e',
    '574e0d42-e712-4a86-be7a-4b3a95187bcd',
    '56078c29-a393-4c60-9e04-3674e02fe729',
    '099d2585-1379-4333-b3b1-ffc0d26d95c5',
    'ab71d294-4ba9-44d4-8051-913b3d5ccff3',
    '90fe2016-e79c-456c-a5f9-3a31149fcd65',
    'e43974fd-cee1-4d8c-a436-6846d7d24129',
    '0d607d21-c9c7-4852-83e3-76825176ee0a',
    '0a356156-961d-4829-b9b5-c07fbc73dacc',
    '18a28450-31ec-4e4a-a305-dbbdd226ae3c',
    'f7d225d9-1675-483d-a1eb-9ef750301cd4',
    'd4b02f5f-7a62-4cad-8ffc-d3deb0fab445',
    '4c8ce027-8094-4f5d-bf62-22b1d51b3c1e',
    'c753046a-cf9b-4813-be68-cb3b9dd9866e',
    'eb7948be-1007-4b0e-b9b6-a5c40bbb9596',
    '7639a9e0-275c-49a8-80c1-cdb01ce23e1c',
    'aa2bfacc-c28c-4192-960c-b1389cf68516',
    'd7349942-f8ff-4ad6-b075-8f39652a7789',
    'b9e0de2a-4085-4226-a073-1744914cbbd4',
    '44b1f60b-e74c-4430-9378-d4a75e2de72f',
    'a4c62d7f-34f0-4e2e-9e46-c762d3ab0ff2',
    '6a5b3be6-d1de-4f23-a431-b08e7ab231b8',
    '5208772d-21f9-46b0-8167-0b05b57296b8',
    '36690013-e8bc-43a5-9ba9-83317537557c',
    '172bada7-f1c5-41c4-836d-05381beaed9a',
    '9a1e873b-b1db-4d3e-a83b-ed6c5b3f3ecc',
    '2c6de04e-104d-42c8-8448-97d74985dacb',
    '452bcafd-ab45-4e24-b5e0-13fcf22b0755',
    'fbafdd31-21a0-44c5-ae4d-724839beff61',
    '2a1882d9-88ca-4849-bcc1-f6914f593407',
    '3838993f-59ba-4dec-8110-ac3ea387ab91',
    'bf2f4106-cee9-419c-b4d1-d7b03a6293d5',
    'a6c36f5e-b86c-4164-85ae-8bf0df2e4a90',
    '11a7572f-02b9-4f88-8c2c-802dfb1f94b7',
    '5e547934-c339-410e-a013-dfefed50f4b8',
    'ffa84feb-ca0e-43d3-a04d-a402a8e24a3b',
    '2be072bd-2153-4050-9358-e4b95297a9bf',
    '7c19d852-e36a-4353-afea-10e501601d9a',
    'fd3843fe-ee5d-4784-b0d2-6673f9886d30',
    '84703c54-a9dd-400c-9701-2fc40922e3e3',
    '00297802-e20a-413f-b389-a6f764b6600e',
    'c853d4c0-d4be-433d-964e-e30bdc35480e',
    '3e85b06a-a6ea-4ce8-a655-44b1fce12138',
    '6e674477-522f-4adc-8c50-76910a6a282b',
    '504089f1-c59d-48fe-84ef-858bd3eb3043',
    '0565b2e4-ade1-46e7-80bf-ca647a89a8b8',
    '1cf943bc-9ffe-4fd0-a92d-6fdcf68da743',
    'bb11d621-e471-4ca9-b9ae-cf06c99db297',
    '7b875b4b-a6c5-4c92-a252-cd5ff203089e',
    '97b3d565-3c32-4fd5-be49-c16f0bae84e7',
    'ea08adf0-2383-41ae-a91a-88c7b8f6f42b',
    '5b8c745a-972b-455c-8021-ee24fdbce9a5',
    'bebf0200-8458-4467-b001-ff436564e942',
    '1c16f983-c090-457a-aca7-4181d16e225b',
    'b259ac6c-3358-4faa-abfe-c9d614b76915',
    '1a119cfe-3178-4f06-800b-b2aec50218b8',
    '33c73ae8-f829-438d-bdb1-da0be8f3773f',
    '3d6afb8e-dbcd-4972-8281-ae546b23356c',
    '42fd7b4a-461d-4a4f-bb02-856e7124dce1',
    '08f28ada-3fa1-41f3-a7eb-5b4ff8325145',
    '189a0802-8538-41f8-ad51-8bb2a736783b',
    'e0dc36c3-ff48-4ab5-881f-899578e08dd4',
    '9052b5fc-8ac8-41ea-8a82-6860b8d2c33d',
    'b8bc131f-68d6-4c56-bd37-55c1b0e27d2e',
]

#############################################
## Execution
#############################################

# Run the linkage for the listed snapshots and show the returned results log as a table
results = link_duos_ids_to_snapshots(snapshot_id_list, env, token)
df_results = pd.DataFrame(results, columns = ["Item", "Status", "Message"])
print("\nLinking Results:")
display(df_results)


## Add DUOS IDs Directly

In [2]:
#############################################
## Functions
#############################################

def direct_link_duos_ids_to_snapshots(snapshot_duos_list):
    """Link snapshots to explicitly supplied DUOS IDs through the TDR snapshots API.

    Args:
        snapshot_duos_list: Sequence of entries where index 0 is the snapshot ID
            and index 1 is the DUOS ID to attach to it. An entry with a falsy
            DUOS ID is logged as failed without calling the API.

    Returns:
        A list of ``[item, status, message]`` rows, one per input entry. Each
        link is attempted up to three times before being logged as failed.
    """
    results_log = []
    max_attempts = 3
    retry_delay_seconds = 5

    # Loop through input snapshots and link DUOS IDs to them
    print("Linking DUOS IDs to Snapshots...")
    api_client = refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    for entry in snapshot_duos_list:
        snapshot_id = entry[0]
        duos_id = entry[1]
        print(f"\tProcessing snapshot ID = {snapshot_id}")
        if not duos_id:
            results_log.append([f"DUOS ID to Snapshot Linkage ({snapshot_id})", "Failed", "No DUOS ID found for the snapshot."])
            continue
        item = f"DUOS ID to Snapshot Linkage ({snapshot_id} - {duos_id})"
        msg = ""
        # Every failed attempt consumes one try, whether it raised or merely came
        # back unlinked, so the loop always terminates
        for attempt in range(1, max_attempts + 1):
            try:
                response = snapshots_api.link_duos_dataset_to_snapshot(id=snapshot_id, duos_id=duos_id).to_dict()
                if response.get("linked"):
                    results_log.append([item, "Success", ""])
                    break
                response_message = response.get("message") or f"unexpected response: {response}"
                msg = f"Error linking DUOS ID to Snapshot: {response_message}"
            except Exception as e:
                msg = f"Error linking DUOS ID to Snapshot: {str(e)}"
            if attempt < max_attempts:
                sleep(retry_delay_seconds)
        else:
            # No attempt succeeded: report the last error seen
            results_log.append([item, "Failed", msg])
    return results_log

#############################################
## Input Parameters
#############################################

# Snapshot list: one [snapshot ID, DUOS ID] pair per link to create
snapshot_duos_list = [
    #['snapshot_id', 'duos_id']
    ['74cb5b41-63ac-45b3-a9cc-18fcbbaccb3b', 'DUOS-000159'],
    ['e66e025f-e07c-4f0d-93ed-3ac609b570d5', 'DUOS-000186'],
]

#############################################
## Execution
#############################################

# Link each pair and show the returned results log as a table
results = direct_link_duos_ids_to_snapshots(snapshot_duos_list)
df_results = pd.DataFrame(results, columns = ["Item", "Status", "Message"])
print("\nLinking Results:")
display(df_results)

Linking DUOS IDs to Snapshots...
	Processing snapshot ID = 74cb5b41-63ac-45b3-a9cc-18fcbbaccb3b
	Processing snapshot ID = e66e025f-e07c-4f0d-93ed-3ac609b570d5

Linking Results:


Unnamed: 0,Item,Status,Message
0,DUOS ID to Snapshot Linkage (74cb5b41-63ac-45b3-a9cc-18fcbbaccb3b - DUOS-000159),Success,
1,DUOS ID to Snapshot Linkage (e66e025f-e07c-4f0d-93ed-3ac609b570d5 - DUOS-000186),Success,


# Script Development

## Fetch parameters from snapshot/dataset

In [None]:
# Parameters
snapshot_id = "099d2585-1379-4333-b3b1-ffc0d26d95c5"

# Retrieve snapshot details
api_client = refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
# The first source entry of the snapshot identifies its parent dataset
dataset_id = snapshot_details["source"][0]["dataset"]["id"]
phs_id = snapshot_details["source"][0]["dataset"]["phs_id"]

# Retrieve dataset details
dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["PROPERTIES"]).to_dict()
# Default both values to None so the prints below never hit an undefined name
# (or a stale value left over from an earlier run of this cell)
auth_domain = None
source_workspace = None
dataset_properties = dataset_details.get("properties") or {}
if dataset_properties.get("auth_domains"):
    auth_domain = dataset_properties["auth_domains"][0]
if dataset_properties.get("source_workspaces"):
    source_workspace = dataset_properties["source_workspaces"][0]

# Print output
print(phs_id)
print(source_workspace)

## Pulling Workspace Attributes

In [None]:
# Parameters
ws_project = "anvil-datastorage"
ws_name = "AnVIL_GREGOR_RELEASE_01_HMB"

# Establish credentials
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Pull workspace attributes
ws_attributes = requests.get(
    url=f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
    headers={"Authorization": f"Bearer {creds.token}"}
).json()

# Map to schema.
# Every attribute is treated as optional: a missing one maps to None instead of
# raising, so a single absent attribute does not abort the whole mapping.
ws_attrs = ws_attributes["workspace"]["attributes"]
terra_dict = {}
terra_dict["studyName"] = ws_attrs.get("library:projectName")
terra_dict["studyType"] = ws_attrs.get("library:studyDesign")
#terra_dict["studyDescription"] = ws_attributes["workspace"]["attributes"].get("description")
# List-valued attributes are read from the "items" key of the attribute value
terra_dict["dataTypes"] = (ws_attrs.get("library:dataCategory") or {}).get("items")
terra_dict["phenotypeIndication"] = ws_attrs.get("library:indication")
terra_dict["species"] = "Homo sapiens"
terra_dict["piName"] = ws_attrs.get("library:datasetOwner")
terra_dict["dataCustodianEmail"] = ws_attrs.get("library:contactEmail")
# Tags of the form "Consortium: X" / "dbGaP: X" carry extra study metadata
for tag in (ws_attrs.get("tag:tags") or {}).get("items") or []:
    if "Consortium:" in tag:
        terra_dict["consortium"] = tag.split(":")[1].strip()
    elif "dbGaP:" in tag:
        terra_dict["dbGaPPhsID"] = tag.split(":")[1].strip()
terra_dict["consentGroups.consentCode"] = ws_attrs.get("library:dataUseRestriction")
terra_dict["consentGroups.fileTypes.fileType"] = (ws_attrs.get("library:datatype") or {}).get("items")

# View schema
print(terra_dict)


In [None]:
# Bare expression: shows the raw workspace response as the cell output for inspection
ws_attributes

In [None]:
# Bare expression: shows the raw workspace response as the cell output for inspection
ws_attributes

## dbGaP XML Parse

In [None]:
# Parameters
phs_id = "phs003047"
#phs_id = "phs000693"


def _as_list(value):
    """Normalise an xmltodict child: None -> [], single element -> [element], list -> list."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


# Pull and parse XML
phs_short = phs_id.replace("phs", "")
dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
response = requests.get(url=dbgap_url)
# Fail on an HTTP error instead of handing an error page to the XML parser
response.raise_for_status()
xml_data = xmltodict.parse(response.text)

# Map to schema
dbgap_xml_dict = {}
# xmltodict yields a list when an element repeats and a plain dict when it
# occurs once; when several <Study> elements come back only the first is used.
if isinstance(xml_data["dbgapss"]["Study"], list):
    study_data = xml_data["dbgapss"]["Study"][0]
else:
    study_data = xml_data["dbgapss"]["Study"]
study_info = study_data["StudyInfo"]
authority = study_data["Authority"]
dbgap_xml_dict["studyName"] = study_info.get("StudyNameEntrez")
dbgap_xml_dict["studyDescription"] = study_info.get("Description")
dbgap_xml_dict["dbGaPPhsID"] = phs_id
dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_info.get("StudyNameEntrez")
# <Person> needs the same single-vs-list normalisation as <IC>: with exactly
# one person, iterating the raw value would walk the dict's keys and crash.
for ap_entry in _as_list((authority.get("Persons") or {}).get("Person")):
    role = ap_entry.get("Role")
    organization = ap_entry.get("Organization")
    if role == "PI":
        dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
        dbgap_xml_dict["piEmail"] = ap_entry.get("@email")
        dbgap_xml_dict["piInstitution"] = organization
    elif role == "PO" and organization == "NIH":
        dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
    elif role == "GPA" and organization == "NIH":
        dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
ic_list = [ic_entry["@name"] for ic_entry in _as_list((authority.get("ICs") or {}).get("IC"))]
dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
dbgap_xml_dict["numberOfParticipants"] = study_data.get("@num_participants")
dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")

# View schema
print(dbgap_xml_dict)


In [None]:
# Bare expression: shows the parsed dbGaP study XML (as a dict) as the cell output
study_data

In [None]:
# Bare expression: shows the parsed dbGaP study XML (as a dict) as the cell output
study_data

## dbGaP Study API

In [None]:
# Parameters
study_uid = 483191234

# Pull and parse JSON
dbgap_study_url = f"https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/{study_uid}"
response = requests.get(url=dbgap_study_url)
study_api_data = json.loads(response.text)

# Map to schema
# The mapping is skipped (leaving an empty dict) when the payload carries an
# "error" entry; otherwise every field is optional and maps to None if absent.
dbgap_study_api_dict = {}
if study_api_data.get("error") is None:
    study_config = study_api_data.get("data") or {}
    dbgap_study_api_dict["studyName"] = study_config.get("report_name")
    dbgap_study_api_dict["studyDescription"] = study_config.get("description")
    dbgap_study_api_dict["phenotypeIndication"] = study_config.get("primary_disease")
    dbgap_study_api_dict["studyType"] = study_config.get("study_design")
    # `or []` keeps a study without an attribution list from raising TypeError
    for attr_entry in study_config.get("attribution") or []:
        if attr_entry.get("title") == "Principal Investigator":
            # Only the first principal investigator listed is recorded
            dbgap_study_api_dict["piName"] = attr_entry.get("name")
            dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
            break

# View schema
print(dbgap_study_api_dict)

In [None]:
# Bare expression: shows the raw dbGaP Study API response as the cell output
study_api_data

## dbGaP FHIR API

In [None]:
# Parameters
#phs_id = "phs003047"
phs_id = "phs000693"

# Identifiers used to pick the relevant codings/extensions out of the resource
study_design_system = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign"
molecular_data_types_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes"
molecular_data_type_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType"
content_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content"
num_subjects_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects"


def _display_or_code(coding_entry):
    """Return a coding's display text, falling back to its code when display is empty."""
    return coding_entry.get("display") or coding_entry.get("code")


# Pull and parse JSON
dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
response = requests.get(url=dbgap_fhir_url)
fhir_data = json.loads(response.text)

# Fail with a readable message when the response holds no entries
# (presumably the case for an unrecognised phs_id -- TODO confirm).
entries = fhir_data.get("entry") or []
if not entries:
    raise ValueError(f"No ResearchStudy entry returned by the dbGaP FHIR API for {phs_id}")
resource = entries[0]["resource"]
# Every nested collection below is optional, hence the `or []` / `or {}` guards
extensions = resource.get("extension") or []

# Map to schema
dbgap_fhir_dict = {}
dbgap_fhir_dict["studyName"] = resource.get("title")
dbgap_fhir_dict["studyDescription"] = resource.get("description")
dbgap_fhir_dict["dbGaPPhsID"] = phs_id
dbgap_fhir_dict["dbGaPStudyRegistrationName"] = resource.get("title")
dbgap_fhir_dict["nihICsSupportingStudy"] = (resource.get("sponsor") or {}).get("display")
# studyType: comma-separated study designs; key is left unset when none is found
study_designs = []
for cat_entry in resource.get("category") or []:
    for coding_entry in cat_entry.get("coding") or []:
        if coding_entry.get("system") == study_design_system:
            value = _display_or_code(coding_entry)
            if value:
                study_designs.append(value)
if study_designs:
    dbgap_fhir_dict["studyType"] = ", ".join(study_designs)
# dataTypes
dt_list = []
for ext_entry in extensions:
    if ext_entry.get("url") == molecular_data_types_url:
        for inner_ext_entry in ext_entry.get("extension") or []:
            if inner_ext_entry.get("url") == molecular_data_type_url:
                concept = inner_ext_entry.get("valueCodeableConcept") or {}
                for coding_entry in concept.get("coding") or []:
                    dt_list.append(coding_entry.get("code"))
dbgap_fhir_dict["dataTypes"] = dt_list
# phenotypeIndication: comma-separated focus codings; key is left unset when none is found
indications = []
for focus_entry in resource.get("focus") or []:
    for coding_entry in focus_entry.get("coding") or []:
        value = _display_or_code(coding_entry)
        if value:
            indications.append(value)
if indications:
    dbgap_fhir_dict["phenotypeIndication"] = ", ".join(indications)
# numberOfParticipants
for ext_entry in extensions:
    if ext_entry.get("url") == content_url:
        for inner_ext_entry in ext_entry.get("extension") or []:
            if inner_ext_entry.get("url") == num_subjects_url:
                dbgap_fhir_dict["numberOfParticipants"] = (inner_ext_entry.get("valueCount") or {}).get("code")

# View schema
print(dbgap_fhir_dict)

In [None]:
# Bare expression: shows the raw dbGaP FHIR response as the cell output
fhir_data

# Utilities

## Delete Studies from DUOS

In [None]:
# Inputs
import os
from getpass import getpass

# SECURITY: never paste a bearer token into the notebook source -- it ends up
# in version control and in shared copies. Any token that was previously
# committed here must be treated as compromised and revoked.
# The token is read from the DUOS_TOKEN environment variable, falling back to
# an interactive prompt that does not echo it.
token = os.environ.get("DUOS_TOKEN") or getpass("DUOS bearer token: ")
study_id_list = [
    # DUOS study IDs to delete
]

# Delete studies
for study_id in study_id_list:
    print(f"Deleting study ID {study_id}")
    response = requests.delete(
        url=f"https://consent.dsde-dev.broadinstitute.org/api/dataset/study/{study_id}",
        headers={"Authorization": f"Bearer {token}"}
    )
    if response.status_code == 200:
        print("Study deleted successfully.")
    else:
        # Error bodies are not guaranteed to be JSON objects with a "message"
        # key; fall back to the raw text so one failure does not abort the loop.
        try:
            msg = response.json().get("message", response.text)
        except (ValueError, AttributeError):
            msg = response.text
        print(f"Error deleting study: {msg}")
    

## Build Lookup of Datasets and Studies in DUOS

In [None]:
# Inputs
import os
from getpass import getpass

# SECURITY: never paste a bearer token into the notebook source -- it ends up
# in version control and in shared copies. Any token that was previously
# committed here must be treated as compromised and revoked.
# The token is read from the DUOS_TOKEN environment variable, falling back to
# an interactive prompt that does not echo it.
token = os.environ.get("DUOS_TOKEN") or getpass("DUOS bearer token: ")
user_id = 5100 # Set to None to return all datasets/studies, otherwise will filter on those created or updated by the specified user

# Pull a list of existing datasets and studies from DUOS and build lookup dicts
datasets_response = requests.get(
    url="https://consent.dsde-dev.broadinstitute.org/api/dataset/v2?asCustodian=false",
    headers={"Authorization": f"Bearer {token}"}
)
# Stop on an HTTP error (e.g. expired token) instead of iterating an error payload
datasets_response.raise_for_status()
datasets = datasets_response.json()

# study_lookup:   study name   -> studyId (first dataset seen for a study wins)
# dataset_lookup: dataset name -> dataSetId
study_lookup = {}
dataset_lookup = {}
for dataset_entry in datasets:
    if user_id:
        created_user = dataset_entry.get("createUserId") or 0
        # Guarded on updateUserId itself; the previous version tested
        # createUserId here, so a dataset updated by the user but lacking a
        # creator ID was wrongly filtered out.
        updated_user = dataset_entry.get("updateUserId") or 0
        if user_id not in (created_user, updated_user):
            continue
    # A dataset may come back without a study attached
    study_entry = dataset_entry.get("study") or {}
    study_name = study_entry.get("name")
    if study_name and not study_lookup.get(study_name):
        study_lookup[study_name] = study_entry["studyId"]
    if dataset_entry.get("name"):
        dataset_lookup[dataset_entry["name"]] = dataset_entry["dataSetId"]

In [None]:
# Collect the study IDs (the values of the name -> ID lookup) in insertion order
study_list = list(study_lookup.values())

In [None]:
# Bare expression: shows the collected study IDs as the cell output
study_list

In [None]:
# Bare expression: shows the dataset name -> dataSetId lookup as the cell output
dataset_lookup