# Imports

In [4]:
import requests
import json
import google.auth
import xmltodict
import data_repo_client
import pandas as pd
import re

# Function to refresh TDR API client
def refresh_tdr_api_client():
    creds, project = google.auth.default()
    auth_req = google.auth.transport.requests.Request()
    creds.refresh(auth_req)
    config = data_repo_client.Configuration()
    config.host = "https://data.terra.bio"
    config.access_token = creds.token
    api_client = data_repo_client.ApiClient(configuration=config)
    api_client.client_side_validation = False
    return api_client

# Display options
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)

# Putting it all together

In [31]:
#############################################
## Functions
#############################################

def coalesce(*arg): 
    return next((a for a in arg if a is not None), None)

def format_description(input_string):
    output_string = input_string if input_string else ""
    output_string = re.sub("\n\n\t", " ", output_string)
    output_string = re.sub("\t", " ", output_string)
    output_string = re.sub("study.cgi\?study_id=|.\/study.cgi\?study_id=", "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=", output_string)
    return output_string

def fetch_dataset_details(snapshot_id):
    
    # Initialize variables
    terra_dict = {}
    dbgap_xml_dict = {}
    dbgap_study_api_dict = {}
    dbgap_fhir_dict = {}
    final_results_dict = {}
    
    # Retrieve snapshot details
    api_client = refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
    dataset_id = snapshot_details["source"][0]["dataset"]["id"]
    phs_id = snapshot_details["source"][0]["dataset"]["phs_id"]
    dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["PROPERTIES"]).to_dict()
    if dataset_details["properties"].get("source_workspaces"):
        source_workspace = dataset_details["properties"]["source_workspaces"][0]
    else:
        source_workspace = None
    print("Snapshot PHS_ID: " + phs_id)
    print("Source Workspace: " + source_workspace)
    
    # Pull information from original workspace (if listed)
    if source_workspace:
        # Establish credentials
        creds, project = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)

        # Pull workspace attributes
        ws_attributes = requests.get(
            url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{source_workspace}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
            headers={"Authorization": f"Bearer {creds.token}"}
        ).json()

        # Map to schema
        terra_dict["studyName"] = ws_attributes["workspace"]["attributes"].get("library:projectName")
        terra_dict["studyType"] = ws_attributes["workspace"]["attributes"].get("library:studyDesign")
        terra_dict["studyDescription"] = ws_attributes["workspace"]["attributes"].get("description")
        if ws_attributes["workspace"]["attributes"].get("library:dataCategory"):
            terra_dict["dataTypes"] = ws_attributes["workspace"]["attributes"]["library:dataCategory"]["items"]
        terra_dict["phenotypeIndication"] = ws_attributes["workspace"]["attributes"].get("library:indication")
        terra_dict["species"] = "Homo sapiens"
        terra_dict["piName"] = ws_attributes["workspace"]["attributes"].get("library:datasetOwner")
        terra_dict["dataCustodianEmail"] = ws_attributes["workspace"]["attributes"].get("library:contactEmail")
        if ws_attributes["workspace"]["attributes"].get("tag:tags"):
            for tag in ws_attributes["workspace"]["attributes"].get("tag:tags")["items"]:
                if "Consortium:" in tag:
                    terra_dict["consortium"] = tag.split(":")[1].strip()
                elif "dbGaP:" in tag:
                    terra_dict["dbGaPPhsID"] = tag.split(":")[1].strip()
                    if not phs_id:
                        phs_id = tag.split(":")[1].strip() 
        terra_dict["consentGroups.consentCode"] = ws_attributes["workspace"]["attributes"].get("library:dataUseRestriction")
        if ws_attributes["workspace"]["attributes"].get("library:datatype"):
            terra_dict["consentGroups.fileTypes.fileType"] = ws_attributes["workspace"]["attributes"]["library:datatype"]["items"]
#         print("------------------------------------------------------")
#         print("terra_dict")
#         print(terra_dict)
        
    # Pull information from dbGaP (if phs_id listed)
    print("PHS ID for dbGaP: " + phs_id)
    if phs_id:
        # Pull and parse XML
        phs_short = phs_id.replace("phs", "")
        dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
        response = requests.get(url=dbgap_url)
        xml_data = xmltodict.parse(response.text)
        study_uid = ""

        # Map to schema
        if xml_data["dbgapss"].get("Study"):
            if isinstance(xml_data["dbgapss"]["Study"], list):
                study_data = xml_data["dbgapss"]["Study"][0]
            else:
                study_data = xml_data["dbgapss"]["Study"] 
            study_uid = study_data.get("@uid")
            dbgap_xml_dict["studyName"] = study_data["StudyInfo"].get("StudyNameEntrez")
            dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
            dbgap_xml_dict["dbGaPPhsID"] = phs_id
            dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
            if study_data["Authority"]["Persons"].get("Person"):
                for ap_entry in study_data["Authority"]["Persons"]["Person"]:
                    if ap_entry["Role"] == "PI":
                        dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                        dbgap_xml_dict["piEmail"] = ap_entry["@email"]
                        dbgap_xml_dict["piInstitution"] = ap_entry["Organization"]
                    elif ap_entry["Role"] == "PO" and ap_entry["Organization"] == "NIH":
                        dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
                    elif ap_entry["Role"] == "GPA" and ap_entry["Organization"] == "NIH":
                        dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
            ic_list = []
            if isinstance(study_data["Authority"]["ICs"]["IC"], list):
                for ic_entry in study_data["Authority"]["ICs"]["IC"]:
                    ic_list.append(ic_entry["@name"])
            else:
                ic_list.append(study_data["Authority"]["ICs"]["IC"]["@name"])
            dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
            dbgap_xml_dict["numberOfParticipants"] = study_data.get("@num_participants")
            dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")
#             print("------------------------------------------------------")
#             print("dbgap_xml_dict")
#             print(dbgap_xml_dict)
        
        # Pull and parse Study API JSON
        if study_uid:
            dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
            response = requests.get(url=dbgap_study_url)
            study_api_data = json.loads(response.text)

            # Map to schema
            if study_api_data.get("error") == None:
                dbgap_study_api_dict["studyName"] = study_api_data["data"].get("report_name")
                dbgap_study_api_dict["studyDescription"] = study_api_data["data"].get("description")
                dbgap_study_api_dict["phenotypeIndication"] = study_api_data["data"].get("primary_disease")
                dbgap_study_api_dict["studyType"] = study_api_data["data"].get("study_design")
                dbgap_study_api_dict["dbGaPPhsID"] = phs_id
                dbgap_study_api_dict["dbGaPStudyRegistrationName"] = study_api_data["data"].get("report_name")
                for attr_entry in study_api_data["data"].get("attribution"):
                    if attr_entry.get("title") == "Principal Investigator":
                        dbgap_study_api_dict["piName"] = attr_entry.get("name")
                        dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
                        break
#             print("------------------------------------------------------")
#             print("dbgap_study_api_dict")
#             print(dbgap_study_api_dict)
        
        # Pull and parse FHIR API JSON
        dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
        response = requests.get(url=dbgap_fhir_url)
        fhir_data = json.loads(response.text)

        # Map to schema
        if fhir_data.get("entry"):
            dbgap_fhir_dict["studyName"] = fhir_data["entry"][0]["resource"].get("title")
            dbgap_fhir_dict["studyDescription"] = fhir_data["entry"][0]["resource"].get("description")
            dbgap_fhir_dict["dbGaPPhsID"] = phs_id
            dbgap_fhir_dict["dbGaPStudyRegistrationName"] = fhir_data["entry"][0]["resource"].get("title")
            dbgap_fhir_dict["nihICsSupportingStudy"] = fhir_data["entry"][0]["resource"]["sponsor"].get("display")
            # studyType
            if fhir_data["entry"][0]["resource"].get("category"):
                for cat_entry in fhir_data["entry"][0]["resource"].get("category"):
                    for coding_entry in cat_entry.get("coding"):
                        if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
                            value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                            if dbgap_fhir_dict.get("studyType") and value:
                                dbgap_fhir_dict["studyType"] += f", {value}"
                            elif value:
                                dbgap_fhir_dict["studyType"] = value
            # dataTypes
            dt_list = []
            if fhir_data["entry"][0]["resource"].get("extension"): 
                for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
                        for inner_ext_entry in ext_entry.get("extension"):
                            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                                for coding_entry in inner_ext_entry["valueCodeableConcept"].get("coding"):
                                    dt_list.append(coding_entry.get("code"))
            dbgap_fhir_dict["dataTypes"] = dt_list
            # phenotypeIndication
            if fhir_data["entry"][0]["resource"].get("focus"):
                for focus_entry in fhir_data["entry"][0]["resource"].get("focus"):
                    for coding_entry in focus_entry.get("coding"):
                        value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
                        if dbgap_fhir_dict.get("phenotypeIndication") and value:
                            dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
                        elif value:
                            dbgap_fhir_dict["phenotypeIndication"] = value
            # numberOfParticipants
            if fhir_data["entry"][0]["resource"].get("extension"):
                for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
                    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
                        for inner_ext_entry in ext_entry.get("extension"):
                            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                                dbgap_fhir_dict["numberOfParticipants"] = inner_ext_entry["valueCount"].get("code")
#         print("------------------------------------------------------")
#         print("dbgap_fhir_dict")
#         print(dbgap_fhir_dict)
    
    # Reconcile information and create final results
    final_results_dict["snapshot_id"] = snapshot_id
    final_results_dict["studyName"] = coalesce(dbgap_fhir_dict.get("studyName"), dbgap_xml_dict.get("studyName"), dbgap_study_api_dict.get("studyName"), terra_dict.get("studyName"))
    final_results_dict["studyType"] = coalesce(terra_dict.get("studyType"), dbgap_fhir_dict.get("studyType"), dbgap_xml_dict.get("studyType"), dbgap_study_api_dict.get("studyType"))
    final_results_dict["studyDescription"] = format_description(coalesce(dbgap_fhir_dict.get("studyDescription"), dbgap_xml_dict.get("studyDescription"), dbgap_study_api_dict.get("studyDescription"), terra_dict.get("studyDescription")))
    final_results_dict["dataTypes"] = coalesce(terra_dict.get("dataTypes"), dbgap_fhir_dict.get("dataTypes"), dbgap_xml_dict.get("dataTypes"), dbgap_study_api_dict.get("dataTypes"))
    final_results_dict["phenotypeIndication"] = coalesce(terra_dict.get("phenotypeIndication"), dbgap_fhir_dict.get("phenotypeIndication"), dbgap_xml_dict.get("phenotypeIndication"), dbgap_study_api_dict.get("phenotypeIndication"))
    final_results_dict["species"] = "Homo sapiens"
    final_results_dict["piName"] = coalesce(dbgap_fhir_dict.get("piName"), dbgap_xml_dict.get("piName"), dbgap_study_api_dict.get("piName"), terra_dict.get("piName"))
    final_results_dict["piEmail"] = coalesce(dbgap_fhir_dict.get("piEmail"), dbgap_xml_dict.get("piEmail"), dbgap_study_api_dict.get("piEmail"), terra_dict.get("piEmail"))
    final_results_dict["piInstitution"] = coalesce(dbgap_fhir_dict.get("piInstitution"), dbgap_xml_dict.get("piInstitution"), dbgap_study_api_dict.get("piInstitution"), terra_dict.get("piInstitution"))
    final_results_dict["dataCustodianEmail"] = coalesce(terra_dict.get("dataCustodianEmail"), dbgap_fhir_dict.get("dataCustodianEmail"), dbgap_xml_dict.get("dataCustodianEmail"), dbgap_study_api_dict.get("dataCustodianEmail"))
    final_results_dict["dbGaPPhsID"] = coalesce(dbgap_fhir_dict.get("dbGaPPhsID"), dbgap_xml_dict.get("dbGaPPhsID"), dbgap_study_api_dict.get("dbGaPPhsID"), terra_dict.get("dbGaPPhsID"))
    final_results_dict["dbGaPStudyRegistrationName"] = coalesce(dbgap_fhir_dict.get("dbGaPStudyRegistrationName"), dbgap_xml_dict.get("dbGaPStudyRegistrationName"), dbgap_study_api_dict.get("dbGaPStudyRegistrationName"), terra_dict.get("dbGaPStudyRegistrationName"))
    final_results_dict["embargoReleaseDate"] = coalesce(dbgap_fhir_dict.get("embargoReleaseDate"), dbgap_xml_dict.get("embargoReleaseDate"), dbgap_study_api_dict.get("embargoReleaseDate"), terra_dict.get("embargoReleaseDate"))
    final_results_dict["nihICsSupportingStudy"] = coalesce(dbgap_fhir_dict.get("nihICsSupportingStudy"), dbgap_xml_dict.get("nihICsSupportingStudy"), dbgap_study_api_dict.get("nihICsSupportingStudy"), terra_dict.get("nihICsSupportingStudy"))
    final_results_dict["nihProgramOfficerName"] = coalesce(dbgap_fhir_dict.get("nihProgramOfficerName"), dbgap_xml_dict.get("nihProgramOfficerName"), dbgap_study_api_dict.get("nihProgramOfficerName"), terra_dict.get("nihProgramOfficerName"))
    final_results_dict["nihGenomicProgramAdministratorName"] = coalesce(dbgap_fhir_dict.get("nihGenomicProgramAdministratorName"), dbgap_xml_dict.get("nihGenomicProgramAdministratorName"), dbgap_study_api_dict.get("nihGenomicProgramAdministratorName"), terra_dict.get("nihGenomicProgramAdministratorName"))
    final_results_dict["consortium"] = coalesce(terra_dict.get("consortium"), dbgap_fhir_dict.get("consortium"), dbgap_xml_dict.get("consortium"), dbgap_study_api_dict.get("consortium"))
    final_results_dict["consentGroups.consentCode"] = coalesce(terra_dict.get("consentGroups.consentCode"), dbgap_fhir_dict.get("consentGroups.consentCode"), dbgap_xml_dict.get("consentGroups.consentCode"), dbgap_study_api_dict.get("consentGroups.consentCode"))
    final_results_dict["numberOfParticipants"] = coalesce(terra_dict.get("numberOfParticipants"), dbgap_fhir_dict.get("numberOfParticipants"), dbgap_xml_dict.get("numberOfParticipants"), dbgap_study_api_dict.get("numberOfParticipants"))
    final_results_dict["dataLocation"] = "TDR Location"
    final_results_dict["url"] = "https://data.terra.bio/snapshots/" + snapshot_id
    final_results_dict["consentGroups.fileTypes.fileType"] = coalesce(terra_dict.get("consentGroups.fileTypes.fileType"), dbgap_fhir_dict.get("consentGroups.fileTypes.fileType"), dbgap_xml_dict.get("consentGroups.fileTypes.fileType"), dbgap_study_api_dict.get("consentGroups.fileTypes.fileType"))
    
    # Return results
    return final_results_dict


#############################################
## Input Parameters
#############################################

# Specify the users to manage access for and the role they should have:
snapshot_id_list = [
'fc513b58-cfb7-4871-8694-8dc372fc2e10',
'a1dcd80f-6390-489a-a34a-168f26690a36',
'e91ccc70-2772-46d8-b586-cf3e270a05b5',
'6fdea8c7-69d9-466e-9fa2-aca30722ff68',
'02d25240-823f-4b1d-8562-95385716a453',
'99b46287-4790-492c-8a12-bea33f0f927c',
'c6ef5822-3929-4ae7-b5bc-dc27528bf226',
'08d19a7e-b868-4766-9f7e-d879d972cbd7',
'1974a21b-c409-4736-a3d7-e195fa96c4eb',
'8fd5b447-77b6-4c33-b66a-a5cc63587220',
'aa2bfacc-c28c-4192-960c-b1389cf68516',
'44b1f60b-e74c-4430-9378-d4a75e2de72f',
'5b036d13-e058-4d8d-be91-6fdd070686a7',
'0a356156-961d-4829-b9b5-c07fbc73dacc',
'e43974fd-cee1-4d8c-a436-6846d7d24129',
'4c8ce027-8094-4f5d-bf62-22b1d51b3c1e',
'e5ccacfe-1b14-4331-bd8f-a542b5a70d23',
'2efa7d84-2850-4e6f-bb26-7d13ad147b44',
'1410a32b-4ee6-4bd3-96d1-4848d38769d8',
'b77d83c7-2a8e-4f50-be1a-7848f28dc8cb',
'206009c6-cc98-45ab-b504-e6c3a3162a23',
'da841552-40bb-4f05-8edf-ad0a76ed13ac',
'96874f3e-3e02-400a-96d1-5bd20d4cbc09',
'651d2fd2-fc96-47b3-909d-0dd46f575dbc',
'84cfc3d8-282e-4102-ae43-5513e7a3efd5',
'40c4297e-d492-4f6a-b651-ee9ee38db14b',
'8956cc4d-58be-46ae-a81e-74607ffbd9d3',
'381737d9-c0ac-4a78-9883-2977516ee64d',
'6a5b3be6-d1de-4f23-a431-b08e7ab231b8',
'ffe34538-3ddd-48de-b4a2-94f9b2dad086',
'a4c62d7f-34f0-4e2e-9e46-c762d3ab0ff2',
'5208772d-21f9-46b0-8167-0b05b57296b8',
'b735d811-a7ed-4d82-8b9d-5f23a9f33936',
'367b7e53-512c-40c2-af8c-53477a79bcb7',
]

#############################################
## Execution
#############################################
dataset_details_records = []
for snapshot_id in snapshot_id_list:
    dataset_details = fetch_dataset_details(snapshot_id)
    dataset_details_records.append(dataset_details)
output = pd.DataFrame(dataset_details_records)
display(output)

Snapshot PHS_ID: 
Source Workspace: AnVIL_CCDG_WASHU_PAGE
PHS ID for dbGaP: TBD
Snapshot PHS_ID: 
Source Workspace: AnVIL_CCDG_Broad_NP_Epilepsy_JPNFKA_GRU_WES
PHS ID for dbGaP: 
Snapshot PHS_ID: phs001642
Source Workspace: anvil_ccdg_broad_ai_ibd_daly_bernstein_gsa
PHS ID for dbGaP: phs001642
Snapshot PHS_ID: phs000298
Source Workspace: AnVIL_CCDG_NHGRI_Broad_ASD_Daly_phs000298_WES_vcf
PHS ID for dbGaP: phs000298
Snapshot PHS_ID: phs002206
Source Workspace: AnVIL_CMH_GAFK_GS-linked-read
PHS ID for dbGaP: phs002206
Snapshot PHS_ID: phs002206
Source Workspace: AnVIL_CMH_GAFK_SCATAC
PHS ID for dbGaP: phs002206
Snapshot PHS_ID: phs002206
Source Workspace: AnVIL_CMH_GAFK_scRNA
PHS ID for dbGaP: phs002206
Snapshot PHS_ID: phs002206
Source Workspace: AnVIL_CMH_GAFK_WGBS
PHS ID for dbGaP: phs002206
Snapshot PHS_ID: phs002206
Source Workspace: AnVIL_CMH_GAFK_GS-long-read
PHS ID for dbGaP: phs002206
Snapshot PHS_ID: phs002032
Source Workspace: AnVIL_NIMH_Broad_ConvergentNeuro_McCarroll_Eggan_Fi

Unnamed: 0,snapshot_id,studyName,studyType,studyDescription,dataTypes,phenotypeIndication,species,piName,piEmail,piInstitution,dataCustodianEmail,dbGaPPhsID,dbGaPStudyRegistrationName,embargoReleaseDate,nihICsSupportingStudy,nihProgramOfficerName,nihGenomicProgramAdministratorName,consortium,consentGroups.consentCode,numberOfParticipants,dataLocation,url,consentGroups.fileTypes.fileType
0,fc513b58-cfb7-4871-8694-8dc372fc2e10,,TBD,,[],Unspecified,Homo sapiens,,,,help@lists.anvilproject.org,TBD,,,,,,CCDG,TBD,,TDR Location,https://data.terra.bio/snapshots/fc513b58-cfb7-4871-8694-8dc372fc2e10,[TBD]
1,a1dcd80f-6390-489a-a34a-168f26690a36,,Case Set,AnVIL_CCDG_Broad_NP_Epilepsy_JPNFKA_GRU_WES,[Raw Sequencing data],epilepsy,Homo sapiens,,,,ccusick@broadinstitute.org,,,,,,,CCDG,GRU,,TDR Location,https://data.terra.bio/snapshots/a1dcd80f-6390-489a-a34a-168f26690a36,[Exome]
2,e91ccc70-2772-46d8-b586-cf3e270a05b5,Center for Common Disease Genomics [CCDG] - Autoimmune: Inflammatory Bowel Disease (IBD) Exomes and Genomes,TBD,"\nThe National Human Genome Research Institute (NHGRI) has funded a collaborative large-scale genome sequencing effort to comprehensively identify rare risk and protective variants contributing to multiple common disease phenotypes. Called the Centers for Common Disease Genomics (CCDG), this initiative will explore a range of diseases with the ultimate goal of: undertaking variant discovery for enough different examples of disease architectures and study designs to better understand the general principles of the genomic architecture underlying common, complex inherited diseases; understanding how best to design rare variant studies for common disease; developing resources, informatics tools, and innovative approaches and technologies for multiple disease research communities and the wider biomedical research community. The initial focus of the CCDGs will be in cardiovascular disease (early-onset coronary artery disease, atrial fibrillation, hemorrhagic stroke), neuropsychiatric disease (epilepsy, autism), and autoimmune/inflammatory disease (type 1 diabetes, inflammatory bowel disease). The Broad Institute is one of four selected CCDG project centers. The overarching aim of the Inflammatory Bowel Disease (IBD) program is to define the full allelic spectrum of protein-altering variation in genes associated to IBD, and assess their role in both Crohn's Disease (CD) and Ulcerative Colitis (UC) risk. The whole genome sequencing data generated here is comprised of samples from US-based diverse populations including African American, Puerto-Rican, Caribean and Cuban origins. \n",[Genotyping Array data],inflammatory bowel disease,Homo sapiens,"Mark Daly, PhD",,"The Broad Institute, Cambridge, MA, USA",curley@broadinstitute.org,phs001642,Center for Common Disease Genomics [CCDG] - Autoimmune: Inflammatory Bowel Disease (IBD) Exomes and Genomes,none,National Human Genome Research Institute,"Felsenfeld, Adam","Strasburger, Jennifer",CCDG,TBD,1.0,TDR Location,https://data.terra.bio/snapshots/e91ccc70-2772-46d8-b586-cf3e270a05b5,[Genotyping Array]
3,6fdea8c7-69d9-466e-9fa2-aca30722ff68,Autism Sequencing Consortium (ASC),Unspecified,"\nThe ARRA Autism Sequencing Collaboration was created in 2010 bringing together expert large-scale sequencing center (at the Baylor College of Medicine, PI Richard Gibbs and the Board Institute of MIT and Harvard, PI Mark J. Daly) and a collaborative network of research labs focused on the genetics of autism (brought together by the Autism Genome Project and the Autism Consortium). These groups worked together to utilize dramatic new advances in DNA sequencing technology to reveal the genetic architecture of autism through comprehensive examination of the exotic sequence of all genes. The Autism Sequencing Consortium (ASC) was founded by Joseph D. Buxbaum and colleagues as an international group of scientists who share autism spectrum disorder (ASD) samples and genetic data. The PIs are Drs. Joseph D. Buxbaum (Icahn School of Medicine at Mount Sinai), Mark J. Daly (Broad Institute of MIT and Harvard), Bernie Devlin (University of Pittsburgh School of Medicine), Kathryn Roeder (Carnegie Mellon University, Matthew State and Stephan Sanders (University of California, San Francisco). The rationale for the ASC is described in [Buxbaum et al. 2012](https://www.ncbi.nlm.nih.gov/pubmed/23259942), and this paper should be cited when referencing the data set. All shared data and analysis is hosted at a single site, which enables joint analysis of large-scale data from many groups. The ASC was first supported by a cooperative agreement grant to four lead sites funded by the National Institute of Mental Health (U01MH100233, U01MH100209, U01MH100229, U01MH100239), with additional support from the National Human Genome Research Institute. The NIMH recently renewed their support with a second grant (U01MH111661, U01MH111660, U01MH111658 and U01MH111662) to expand the project from 29,000 genomes to more than 50,000 exomes over the next 5 years. NHGRI provides ongoing sequencing support for the ASD through the Broad Center for Common Disease Genomics (UM1HG008895, Mark Daly, PI).\n","[SNP/CNV Genotypes (NGS), WXS]",Autistic Disorder,Homo sapiens,"Daly, Mark",mjdaly@broadinstitute.org,"MASSACHUSETTS INSTITUTE OF TECHNOLOGYBROAD INSTITUTE, INC.MASSACHUSETTS GENERAL HOSPITAL",,phs000298,Autism Sequencing Consortium (ASC),none,National Institute of Mental Health,"Senthil, Geetha","Farber, Gregory",CCDG,,1.0,TDR Location,https://data.terra.bio/snapshots/6fdea8c7-69d9-466e-9fa2-aca30722ff68,
4,02d25240-823f-4b1d-8562-95385716a453,Genomic Answers for Kids (GA4K),Case only / Trio / Family,"\nIdentification of complex genetic variants including defects in gene regulatory circuits and uncharacterized genes present challenges for rare disease diagnosis. In Genomic Answers for Kids program we apply the joint interpretation of patient genome sequence with genomic endophenotypes to expand the clinically actionable genome among pediatric cases of suspected genetic disease. Our objective is to examine the role of novel and under-interpreted sequence variation by combining complete DNA sequences, personal epigenetic variations and massively parallel functional screens.\n\nTo achieve this we perform augmented whole genome sequence (WGS) interpretation including reanalysis of clinical exomes as well as generating WGS for patients with negative exome results all be subject to family-based semi-automated recall pipeline to discover missed diagnostic variation. We use linked-read and long-read sequencing technologies to focus on putative structural variants missed in short-read genome and exome analysis by the optimized integration of linked and long read technologies that also improve identification of transmission patterns of all variants, as well as resolving genomic regions resistant to standard alignment. We also include tissue DNA sequencing to investigate somatic mosaicism.\n\nTo further assist on WGS interpretation for uncommon variation we capture snapshots of patient transcriptomes and epigenomes in individual cells using single-cell RNA (scRNA) and sc open chromatin (scATAC) as well as bulk whole genome bisulphite genome sequencing for methylome interpretation. Alternative splicing is functionally assessed in available patient tissues using RNA-seq, including full length cDNA sequences by IsoSeq (PacBio) methodology.\n\nOur overarching goal is an increase rate of diagnostic genomic findings up to two-fold among rare disease, resulting in the majority (>50%) of patient cases resolved by the integrated system developed in our program.\n",[],pediatric disease,Homo sapiens,"Tomi Pastinen MD, PhD",,"Children's Mercy Hospital, Kansas City, MO, USA",tpastinen@cmh.edu,phs002206,Genomic Answers for Kids (GA4K),none,National Human Genome Research Institute,,"Strasburger, Jennifer",CMH,DS-PEDD-IRB,1.0,TDR Location,https://data.terra.bio/snapshots/02d25240-823f-4b1d-8562-95385716a453,[Whole Genome]
5,99b46287-4790-492c-8a12-bea33f0f927c,Genomic Answers for Kids (GA4K),TBD,"\nIdentification of complex genetic variants including defects in gene regulatory circuits and uncharacterized genes present challenges for rare disease diagnosis. In Genomic Answers for Kids program we apply the joint interpretation of patient genome sequence with genomic endophenotypes to expand the clinically actionable genome among pediatric cases of suspected genetic disease. Our objective is to examine the role of novel and under-interpreted sequence variation by combining complete DNA sequences, personal epigenetic variations and massively parallel functional screens.\n\nTo achieve this we perform augmented whole genome sequence (WGS) interpretation including reanalysis of clinical exomes as well as generating WGS for patients with negative exome results all be subject to family-based semi-automated recall pipeline to discover missed diagnostic variation. We use linked-read and long-read sequencing technologies to focus on putative structural variants missed in short-read genome and exome analysis by the optimized integration of linked and long read technologies that also improve identification of transmission patterns of all variants, as well as resolving genomic regions resistant to standard alignment. We also include tissue DNA sequencing to investigate somatic mosaicism.\n\nTo further assist on WGS interpretation for uncommon variation we capture snapshots of patient transcriptomes and epigenomes in individual cells using single-cell RNA (scRNA) and sc open chromatin (scATAC) as well as bulk whole genome bisulphite genome sequencing for methylome interpretation. Alternative splicing is functionally assessed in available patient tissues using RNA-seq, including full length cDNA sequences by IsoSeq (PacBio) methodology.\n\nOur overarching goal is an increase rate of diagnostic genomic findings up to two-fold among rare disease, resulting in the majority (>50%) of patient cases resolved by the integrated system developed in our program.\n",[],pediatric disease,Homo sapiens,"Tomi Pastinen MD, PhD",,"Children's Mercy Hospital, Kansas City, MO, USA",help@lists.anvilproject.org,phs002206,Genomic Answers for Kids (GA4K),none,National Human Genome Research Institute,,"Strasburger, Jennifer",CMH,DS-PEDD-IRB,1.0,TDR Location,https://data.terra.bio/snapshots/99b46287-4790-492c-8a12-bea33f0f927c,[TBD]
6,c6ef5822-3929-4ae7-b5bc-dc27528bf226,Genomic Answers for Kids (GA4K),TBD,"\nIdentification of complex genetic variants including defects in gene regulatory circuits and uncharacterized genes present challenges for rare disease diagnosis. In Genomic Answers for Kids program we apply the joint interpretation of patient genome sequence with genomic endophenotypes to expand the clinically actionable genome among pediatric cases of suspected genetic disease. Our objective is to examine the role of novel and under-interpreted sequence variation by combining complete DNA sequences, personal epigenetic variations and massively parallel functional screens.\n\nTo achieve this we perform augmented whole genome sequence (WGS) interpretation including reanalysis of clinical exomes as well as generating WGS for patients with negative exome results all be subject to family-based semi-automated recall pipeline to discover missed diagnostic variation. We use linked-read and long-read sequencing technologies to focus on putative structural variants missed in short-read genome and exome analysis by the optimized integration of linked and long read technologies that also improve identification of transmission patterns of all variants, as well as resolving genomic regions resistant to standard alignment. We also include tissue DNA sequencing to investigate somatic mosaicism.\n\nTo further assist on WGS interpretation for uncommon variation we capture snapshots of patient transcriptomes and epigenomes in individual cells using single-cell RNA (scRNA) and sc open chromatin (scATAC) as well as bulk whole genome bisulphite genome sequencing for methylome interpretation. Alternative splicing is functionally assessed in available patient tissues using RNA-seq, including full length cDNA sequences by IsoSeq (PacBio) methodology.\n\nOur overarching goal is an increase rate of diagnostic genomic findings up to two-fold among rare disease, resulting in the majority (>50%) of patient cases resolved by the integrated system developed in our program.\n",[],pediatric disease,Homo sapiens,"Tomi Pastinen MD, PhD",,"Children's Mercy Hospital, Kansas City, MO, USA",help@lists.anvilproject.org,phs002206,Genomic Answers for Kids (GA4K),none,National Human Genome Research Institute,,"Strasburger, Jennifer",CMH,DS-PEDD-IRB,1.0,TDR Location,https://data.terra.bio/snapshots/c6ef5822-3929-4ae7-b5bc-dc27528bf226,[TBD]
7,08d19a7e-b868-4766-9f7e-d879d972cbd7,Genomic Answers for Kids (GA4K),TBD,"\nIdentification of complex genetic variants including defects in gene regulatory circuits and uncharacterized genes present challenges for rare disease diagnosis. In Genomic Answers for Kids program we apply the joint interpretation of patient genome sequence with genomic endophenotypes to expand the clinically actionable genome among pediatric cases of suspected genetic disease. Our objective is to examine the role of novel and under-interpreted sequence variation by combining complete DNA sequences, personal epigenetic variations and massively parallel functional screens.\n\nTo achieve this we perform augmented whole genome sequence (WGS) interpretation including reanalysis of clinical exomes as well as generating WGS for patients with negative exome results all be subject to family-based semi-automated recall pipeline to discover missed diagnostic variation. We use linked-read and long-read sequencing technologies to focus on putative structural variants missed in short-read genome and exome analysis by the optimized integration of linked and long read technologies that also improve identification of transmission patterns of all variants, as well as resolving genomic regions resistant to standard alignment. We also include tissue DNA sequencing to investigate somatic mosaicism.\n\nTo further assist on WGS interpretation for uncommon variation we capture snapshots of patient transcriptomes and epigenomes in individual cells using single-cell RNA (scRNA) and sc open chromatin (scATAC) as well as bulk whole genome bisulphite genome sequencing for methylome interpretation. Alternative splicing is functionally assessed in available patient tissues using RNA-seq, including full length cDNA sequences by IsoSeq (PacBio) methodology.\n\nOur overarching goal is an increase rate of diagnostic genomic findings up to two-fold among rare disease, resulting in the majority (>50%) of patient cases resolved by the integrated system developed in our program.\n",[],pediatric disease,Homo sapiens,"Tomi Pastinen MD, PhD",,"Children's Mercy Hospital, Kansas City, MO, USA",help@lists.anvilproject.org,phs002206,Genomic Answers for Kids (GA4K),none,National Human Genome Research Institute,,"Strasburger, Jennifer",CMH,DS-PEDD-IRB,1.0,TDR Location,https://data.terra.bio/snapshots/08d19a7e-b868-4766-9f7e-d879d972cbd7,[TBD]
8,1974a21b-c409-4736-a3d7-e195fa96c4eb,Genomic Answers for Kids (GA4K),Case only / Trio / Family,"\nIdentification of complex genetic variants including defects in gene regulatory circuits and uncharacterized genes present challenges for rare disease diagnosis. In Genomic Answers for Kids program we apply the joint interpretation of patient genome sequence with genomic endophenotypes to expand the clinically actionable genome among pediatric cases of suspected genetic disease. Our objective is to examine the role of novel and under-interpreted sequence variation by combining complete DNA sequences, personal epigenetic variations and massively parallel functional screens.\n\nTo achieve this we perform augmented whole genome sequence (WGS) interpretation including reanalysis of clinical exomes as well as generating WGS for patients with negative exome results all be subject to family-based semi-automated recall pipeline to discover missed diagnostic variation. We use linked-read and long-read sequencing technologies to focus on putative structural variants missed in short-read genome and exome analysis by the optimized integration of linked and long read technologies that also improve identification of transmission patterns of all variants, as well as resolving genomic regions resistant to standard alignment. We also include tissue DNA sequencing to investigate somatic mosaicism.\n\nTo further assist on WGS interpretation for uncommon variation we capture snapshots of patient transcriptomes and epigenomes in individual cells using single-cell RNA (scRNA) and sc open chromatin (scATAC) as well as bulk whole genome bisulphite genome sequencing for methylome interpretation. Alternative splicing is functionally assessed in available patient tissues using RNA-seq, including full length cDNA sequences by IsoSeq (PacBio) methodology.\n\nOur overarching goal is an increase rate of diagnostic genomic findings up to two-fold among rare disease, resulting in the majority (>50%) of patient cases resolved by the integrated system developed in our program.\n",[],"Genetic Diseases, Inborn",Homo sapiens,"Tomi Pastinen MD, PhD",,"Children's Mercy Hospital, Kansas City, MO, USA",tpastinen@cmh.edu,phs002206,Genomic Answers for Kids (GA4K),none,National Human Genome Research Institute,,"Strasburger, Jennifer",CMH,,1.0,TDR Location,https://data.terra.bio/snapshots/1974a21b-c409-4736-a3d7-e195fa96c4eb,[Whole Long-read Genome]
9,8fd5b447-77b6-4c33-b66a-a5cc63587220,Genetic Neuroscience: How Human Genes and Alleles Shape Neuronal Phenotypes,Prospective Longitudinal Cohort,"\nThe goal of this collaborative, interdisciplinary project is to develop powerful, generalizable approaches for discovering how risk variants for psychiatric disorders shape neurobiological processes at multiple levels of analysis, and to identify the processes whose dysregulation underlies disease.\n\nInduced pluripotent stem cells (iPSCs) were used towards the development of these new experimental and inferential systems bridging gaps between human genetics and experimental biology. The largest publicly available collection of iPSCs (2607 lines) has been generated from 2184 donors by the California Institute for Regenerative Medicine (CIRM). We wish to share the available SNP data for 2166 CIRM lines and whole genome sequence data generated at the Broad Institute for 299 of the CIRM iPSC donors. These data can be used to identify (for experiments) lines with specific genotypes of interest and lines from donors with high or low polygenic risk scores for phenotypes of interest. The data can also be used to identify acquired mutations in the iPSC lines. The CIRM iPSC lines are available through Fujifilm Cellular Dynamics iPSC Repository [[https://www.fujifilmcdi.com/cirm-ipsc-products/](https://www.fujifilmcdi.com/cirm-ipsc-products/)].\n\nAdditional project data registered with the study includes WGS data from an iPSC line derived from an SMA patient (n=1), as well as single-cell RNA sequence data and supplemental processed genomic datasets in support of project publications.\n\n**Molecular Datasets**\n\n - **Whole Genome Genotyping**: Illumina HumanCore chip - **Whole Genome Sequencing**: Illumina Novaseq - **Single-Cell RNA-Seq**: 10X Genomics, Illumina Novaseq - **Supplemental ""cell village"" pooled genomic sequence data**: Illumina Nextseq \n",[Raw Sequencing data],spinal muscular atrophy,Homo sapiens,Steve McCarroll,,,help@lists.anvilproject.org,phs002032,Genetic Neuroscience: How Human Genes and Alleles Shape Neuronal Phenotypes,none,National Institute of Mental Health,"Pevsner, Jonathan","Farber, Gregory",Convergent Neuroscience,DS-SMA-MDS,1.0,TDR Location,https://data.terra.bio/snapshots/8fd5b447-77b6-4c33-b66a-a5cc63587220,[Whole Genome]


# Fetch parameters from snapshot/dataset

In [2]:
# Parameters
snapshot_id = "099d2585-1379-4333-b3b1-ffc0d26d95c5"

# Retrieve snapshot details
api_client = refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
snapshot_details = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
dataset_id = snapshot_details["source"][0]["dataset"]["id"]
phs_id = snapshot_details["source"][0]["dataset"]["phs_id"]

# Retrieve dataset details
dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["PROPERTIES"]).to_dict()
if dataset_details["properties"].get("auth_domains"):
    auth_domain = dataset_details["properties"]["auth_domains"][0]
if dataset_details["properties"].get("source_workspaces"):
    source_workspace = dataset_details["properties"]["source_workspaces"][0]

# Print output
print(phs_id)
print(source_workspace)

phs000693
AnVIL_CMG_UWash_GRU


# Pulling Workspace Attributes

In [15]:
# Parameters
ws_project = "anvil-datastorage"
ws_name = "AnVIL_GREGOR_RELEASE_01_HMB"

# Establish credentials
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Pull workspace attributes
ws_attributes = requests.get(
    url=f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
    headers={"Authorization": f"Bearer {creds.token}"}
).json()

# Map to schema
terra_dict = {}
terra_dict["studyName"] = ws_attributes["workspace"]["attributes"].get("library:projectName")
terra_dict["studyType"] = ws_attributes["workspace"]["attributes"].get("library:studyDesign")
#terra_dict["studyDescription"] = ws_attributes["workspace"]["attributes"].get("description")
terra_dict["dataTypes"] = ws_attributes["workspace"]["attributes"].get("library:dataCategory")["items"]
terra_dict["phenotypeIndication"] = ws_attributes["workspace"]["attributes"].get("library:indication")
terra_dict["species"] = "Homo sapiens"
terra_dict["piName"] = ws_attributes["workspace"]["attributes"].get("library:datasetOwner")
terra_dict["dataCustodianEmail"] = ws_attributes["workspace"]["attributes"].get("library:contactEmail")
if ws_attributes["workspace"]["attributes"].get("tag:tags"):
    for tag in ws_attributes["workspace"]["attributes"].get("tag:tags")["items"]:
        if "Consortium:" in tag:
            terra_dict["consortium"] = tag.split(":")[1].strip()
        elif "dbGaP:" in tag:
            terra_dict["dbGaPPhsID"] = tag.split(":")[1].strip()
terra_dict["consentGroups.consentCode"] = ws_attributes["workspace"]["attributes"]["library:dataUseRestriction"] 
terra_dict["consentGroups.fileTypes.fileType"] = ws_attributes["workspace"]["attributes"]["library:datatype"]["items"]

# View schema
print(terra_dict)


{'studyName': 'GREGoR', 'studyType': 'Case/Control, Cohort, Parent-Offspring Trios, Case Set', 'dataTypes': ['Aligned DNA short read sequencing data, Variant call format (VCFs) files for SNVs and Indels, Data tables with subject, family, clinical phenotype and experimental information'], 'phenotypeIndication': 'Mendelian Disorders', 'species': 'Homo sapiens', 'piName': 'Susanne May', 'dataCustodianEmail': 'gregorconsortium@uw.edu', 'consortium': 'GREGoR', 'dbGaPPhsID': 'phs003047', 'consentGroups.consentCode': 'HMB', 'consentGroups.fileTypes.fileType': ['Whole Exome, Whole Genome']}


In [5]:
ws_attributes

{'workspace': {'attributes': {'library:dulvn': 4,
   'library:studyDesign': 'Case/Control, Cohort, Parent-Offspring Trios, Case Set',
   'library:cohortCountry': 'United States',
   'description': "The NHGRI GREGoR (Genomics Research to Elucidate the Genetics of Rare Disease, https://gregorconsortium.org) Consortium was established in June 2021 with the goal of developing novel tools and approaches to advance the discovery of the genetic basis of rare conditions. Participant information and numerous types of molecular data are collected and generated by the  GREGoR Consortium.  This data is available on the NHGRI Analysis Visualization and Informatics Lab-space ([AnVIL](https://anvilproject.org/)) cloud platform via dbGaP application to [phs003047](http://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs003047.v1.p1).\n\nThe GREGoR Data Set conforms to the GREGoR Consortium [Data Model](https://github.com/UW-GAC/gregor_data_models), which is designed to support comprehen

# dbGaP XML Parse

In [24]:
# Parameters
phs_id = "phs003047"
#phs_id = "phs000693"

# Pull and parse XML
phs_short = phs_id.replace("phs", "")
dbgap_url = "https://dbgap.ncbi.nlm.nih.gov/ss/dbgapssws.cgi?request=Study&phs=" + phs_short
response = requests.get(url=dbgap_url)
xml_data = xmltodict.parse(response.text)

# Map to schema
dbgap_xml_dict = {}
if isinstance(xml_data["dbgapss"]["Study"], list):
    study_data = xml_data["dbgapss"]["Study"][0]
else:
    study_data = xml_data["dbgapss"]["Study"] 
dbgap_xml_dict["studyName"] = study_data["StudyInfo"].get("StudyNameEntrez")
dbgap_xml_dict["studyDescription"] = study_data["StudyInfo"].get("Description")
dbgap_xml_dict["dbGaPPhsID"] = phs_id
dbgap_xml_dict["dbGaPStudyRegistrationName"] = study_data["StudyInfo"].get("StudyNameEntrez")
for ap_entry in study_data["Authority"]["Persons"]["Person"]:
    if ap_entry["Role"] == "PI":
        dbgap_xml_dict["piName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
        dbgap_xml_dict["piEmail"] = ap_entry["@email"]
        dbgap_xml_dict["piInstitution"] = ap_entry["Organization"]
    elif ap_entry["Role"] == "PO" and ap_entry["Organization"] == "NIH":
        dbgap_xml_dict["nihProgramOfficerName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
    elif ap_entry["Role"] == "GPA" and ap_entry["Organization"] == "NIH":
        dbgap_xml_dict["nihGenomicProgramAdministratorName"] = ap_entry["@lname"] + ", " + ap_entry["@fname"]
ic_list = []
if isinstance(study_data["Authority"]["ICs"]["IC"], list):
    for ic_entry in study_data["Authority"]["ICs"]["IC"]:
        ic_list.append(ic_entry["@name"])
else:
    ic_list.append(study_data["Authority"]["ICs"]["IC"]["@name"])
dbgap_xml_dict["nihICsSupportingStudy"] = ic_list
dbgap_xml_dict["numberOfParticipants"] = study_data.get("@num_participants")
dbgap_xml_dict["embargoReleaseDate"] = study_data["Policy"].get("@pub-embargo")

# View schema
print(dbgap_xml_dict)


{'studyName': 'NHGRI GREGoR Consortium: Genomics Research to Elucidate the Genetics of Rare Disease', 'studyDescription': None, 'dbGaPPhsID': 'phs003047', 'dbGaPStudyRegistrationName': 'NHGRI GREGoR Consortium: Genomics Research to Elucidate the Genetics of Rare Disease', 'nihProgramOfficerName': 'Wellington, Christopher', 'nihGenomicProgramAdministratorName': 'Strasburger, Jennifer', 'nihICsSupportingStudy': ['NHGRI'], 'numberOfParticipants': '1000', 'embargoReleaseDate': 'none'}


In [23]:
study_data

{'@uid': '52672',
 '@whole_study_id': '8807',
 '@phs': '000693',
 '@v': '7',
 '@createDate': '2023-08-11T15:26:30-05:00',
 '@modDate': '2023-08-11T15:26:32-05:00',
 '@maxParentChildStudyModDate': '2023-08-11T15:26:32-05:00',
 '@handle': 'MendelianGenomics_UW',
 '@num_participants': '5000',
 'StudyInfo': {'@accession': 'phs000693.v7',
  '@parentAccession': 'phs000693.v7',
  'BioProject': [{'@id': 'PRJNA233538',
    '@entrez_id': '233538',
    '@type': 'bp_admin_access'},
   {'@id': 'PRJNA233539',
    '@entrez_id': '233539',
    '@type': 'bp_data_submission'}],
  'StudyNameEntrez': 'Center for Mendelian Genomics [CMG] - University of Washington Center for Mendelian Genomics',
  'StudyTypes': None,
  'StudyTypes2': {'@calculated': 'No',
   'StudyType21': [{'@name': 'phenotype_data', '@chosen': 'No'},
    {'@name': 'analysis', '@chosen': 'No'},
    {'@name': 'individual_sequencing', '@chosen': 'No'},
    {'@name': 'supporting_documents', '@chosen': 'No'},
    {'@name': 'images', '@chosen':

In [18]:
study_data

{'@uid': '41868',
 '@whole_study_id': '26521',
 '@phs': '001642',
 '@v': '2',
 '@p': '1',
 '@createDate': '2021-05-11T09:57:32-05:00',
 '@completedByGPADate': '2022-04-27T14:24:32-05:00',
 '@modDate': '2023-05-05T22:04:26-05:00',
 '@maxParentChildStudyModDate': '2023-05-05T22:04:26-05:00',
 '@handle': 'CCDG_IBD',
 '@num_participants': '8000',
 'StudyInfo': {'@accession': 'phs001642.v2.p1',
  '@parentAccession': 'phs001642.v2.p1',
  'BioProject': [{'@id': 'PRJNA477898',
    '@entrez_id': '477898',
    '@type': 'bp_admin_access'},
   {'@id': 'PRJNA477899',
    '@entrez_id': '477899',
    '@type': 'bp_data_submission'}],
  'StudyNameEntrez': 'Center for Common Disease Genomics [CCDG] - Inflammatory Bowel Disease (IBD) Genomes',
  'StudyTypes': None,
  'StudyTypes2': {'@calculated': 'No',
   'StudyType21': [{'@name': 'phenotype_data', '@chosen': 'No'},
    {'@name': 'analysis', '@chosen': 'No'},
    {'@name': 'individual_sequencing', '@chosen': 'No'},
    {'@name': 'supporting_documents', 

# dbGaP Study API

In [19]:
# Parameters
study_uid = 483191234

# Pull and parse JSON
dbgap_study_url = "https://submit.ncbi.nlm.nih.gov/dbgap/api/v1/study_config/" + str(study_uid)
response = requests.get(url=dbgap_study_url)
study_api_data = json.loads(response.text)

# Map to schema
dbgap_study_api_dict = {}
if study_api_data.get("error") == None:
    dbgap_study_api_dict["studyName"] = study_api_data["data"].get("report_name")
    dbgap_study_api_dict["studyDescription"] = study_api_data["data"].get("description")
    dbgap_study_api_dict["phenotypeIndication"] = study_api_data["data"].get("primary_disease")
    dbgap_study_api_dict["studyType"] = study_api_data["data"].get("study_design")
    for attr_entry in study_api_data["data"].get("attribution"):
        if attr_entry.get("title") == "Principal Investigator":
            dbgap_study_api_dict["piName"] = attr_entry.get("name")
            dbgap_study_api_dict["piInstitution"] = attr_entry.get("institute")
            break

# View schema
print(dbgap_study_api_dict)

{}


In [20]:
study_api_data

{'error': {'message': 'Invalid or non-existent study with study uid 483191234',
  'code': 1}}

# dbGaP FHIR API

In [14]:
# Parameters
#phs_id = "phs003047"
phs_id = "phs000693"

# Pull and parse JSON
dbgap_fhir_url = "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=" + phs_id
response = requests.get(url=dbgap_fhir_url)
fhir_data = json.loads(response.text)

# Map to schema
dbgap_fhir_dict = {}
dbgap_fhir_dict["studyName"] = fhir_data["entry"][0]["resource"].get("title")
dbgap_fhir_dict["studyDescription"] = fhir_data["entry"][0]["resource"].get("description")
dbgap_fhir_dict["dbGaPPhsID"] = phs_id
dbgap_fhir_dict["dbGaPStudyRegistrationName"] = fhir_data["entry"][0]["resource"].get("title")
dbgap_fhir_dict["nihICsSupportingStudy"] = fhir_data["entry"][0]["resource"]["sponsor"].get("display")
# studyType
for cat_entry in fhir_data["entry"][0]["resource"].get("category"):
    for coding_entry in cat_entry.get("coding"):
        if coding_entry.get("system") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/ResearchStudy-StudyDesign":
            value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
            if dbgap_fhir_dict.get("studyType") and value:
                dbgap_fhir_dict["studyType"] += f", {value}"
            elif value:
                dbgap_fhir_dict["studyType"] = value
# dataTypes
dt_list = []
for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes":
        for inner_ext_entry in ext_entry.get("extension"):
            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-MolecularDataTypes-MolecularDataType":
                for coding_entry in inner_ext_entry["valueCodeableConcept"].get("coding"):
                    dt_list.append(coding_entry.get("code"))
dbgap_fhir_dict["dataTypes"] = dt_list
# phenotypeIndication
for focus_entry in fhir_data["entry"][0]["resource"].get("focus"):
    for coding_entry in focus_entry.get("coding"):
        value = coding_entry.get("display") if coding_entry.get("display") else coding_entry.get("code")
        if dbgap_fhir_dict.get("phenotypeIndication") and value:
            dbgap_fhir_dict["phenotypeIndication"] += f", {value}"
        elif value:
            dbgap_fhir_dict["phenotypeIndication"] = value
# numberOfParticipants
for ext_entry in fhir_data["entry"][0]["resource"].get("extension"):
    if ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content":
        for inner_ext_entry in ext_entry.get("extension"):
            if inner_ext_entry.get("url") == "https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-Content-NumSubjects":
                dbgap_fhir_dict["numberOfParticipants"] = inner_ext_entry["valueCount"].get("code")

# View schema
print(dbgap_fhir_dict)

{'studyName': 'University of Washington Center for Mendelian Genomics (UW-CMG)', 'studyDescription': '\nThe Centers for Mendelian Genomics project uses next-generation sequencing and computational approaches to discover the genes and variants that underlie Mendelian conditions. By discovering genes that cause Mendelian conditions, we will expand our understanding of their biology to facilitate diagnosis and new treatments.\n', 'dbGaPPhsID': 'phs000693', 'dbGaPStudyRegistrationName': 'University of Washington Center for Mendelian Genomics (UW-CMG)', 'nihICsSupportingStudy': 'National Human Genome Research Institute', 'studyType': 'Mendelian', 'dataTypes': ['SNP Genotypes (NGS)', 'SNP/CNV Genotypes (NGS)'], 'phenotypeIndication': 'Rare Diseases', 'numberOfParticipants': '1'}


In [15]:
fhir_data

{'resourceType': 'Bundle',
 'id': '37fafb4c-88e7-4afc-8a1c-e2229ba94055',
 'meta': {'lastUpdated': '2023-10-10T10:00:06.464-04:00'},
 'type': 'searchset',
 'total': 1,
 'link': [{'relation': 'self',
   'url': 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy?_format=json&_id=phs000693'}],
 'entry': [{'fullUrl': 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/ResearchStudy/phs000693',
   'resource': {'resourceType': 'ResearchStudy',
    'id': 'phs000693',
    'meta': {'versionId': '1',
     'lastUpdated': '2022-02-14T02:01:18.881-05:00',
     'source': '#z4zdXYRpgdryBD2q',
     'security': [{'system': 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/CodeSystem/DbGaPConcept-SecurityStudyConsent',
       'code': 'public',
       'display': 'public'}]},
    'extension': [{'url': 'https://dbgap-api.ncbi.nlm.nih.gov/fhir/x1/StructureDefinition/ResearchStudy-StudyOverviewUrl',
      'valueUrl': 'https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id=phs000693.v6.p2'},
     {'url'

# Scratch

In [None]:
# Description clean-up from Dave
function getStudyDescription(resource: ResearchStudy): string {
  if (resource) {
    const rawDescription = resource.description;
    if (rawDescription) {
      /* Replace any `\n\n\t` with space to avoid unwanted line breaks
      /* Replace any `\t` (tab) with a space - avoids markdown processing tab as <pre/>. */
      /* Replace any dbGap internal links with an external link to the dbGap study. */
      const parsedDescription = rawDescription
        .replace(/\n\n\t/g, " ")
        .replace(/\t/g, " ")
        .replace(
          /study.cgi\?study_id=|.\/study.cgi\?study_id=/g,
          "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study_id="
        );
      return markdownToHTML(parsedDescription);
    }
  }
  return "";
}