In [None]:
# Version History
#print("Version 1.0.0: 09/23/2022 5:06pm - Nate Calvanese - First version created")
#print("Version 1.0.1: 09/26/2022 11:18m - Nate Calvanese - Fixed bug in default dataset naming")
#print("Version 1.0.2: 09/27/2022 2:43pm - Nate Calvanese - Added ability to aggregate multiple workspaces into one dataset")
#print("Version 1.0.3: 10/5/2022 1:32pm - Nate Calvanese - Added support for chunking up ingest requests")
#print("Version 1.0.4: 10/6/2022 10:35am - Nate Calvanese - Updated use of TDR utility functions")
#print("Version 1.0.5: 10/13/2022 10:54am - Nate Calvanese - Parameter tweaks for latest changes")
#print("Version 1.0.6: 10/21/2022 10:53am - Nate Calvanese - Version stamp for latest changes to supporting notebooks")
#print("Version 1.0.7: 10/24/2022 4:58pm - Nate Calvanese - Added support for project entity name derivation")
#print("Version 1.0.8: 10/26/2022 4:24pm - Nate Calvanese - Added support for batching mapping activities in section 3")
#print('Version 1.0.9: 2/21/2023 2:50pm - Nate Calvanese - Added support for $BQ_DATASET substitution variable in mapping section')
#print('Version 1.0.10: 3/8/2023 8:17am - Nate Calvanese - Performance improvements')
#print('Version 1.0.11: 7/11/2023 8:17am - Nate Calvanese - Added auth domain back as reader on snapshots')
#print('Version 1.0.12: 9/1/2023 10:16am - Nate Calvanese - Added functionality to enable/disable secure monitoring for public datasets.')
#print('Version 1.0.13: 12/15/2023 9:00am - Nate Calvanese - Added functionality to optionally truncate tables before ingest')
print('Version 1.0.14: 1/12/2024 11:28am - Nate Calvanese - Added max_combined_rec_ref_size as a global parameter')


# Imports and Common Variables

In [None]:
# Install additional modules (one time effort per cloud environment)
!pip install --upgrade pip import_ipynb data_repo_client urllib3 xmltodict azure-storage-blob
# !pip install data_repo_client==1.409.0

In [1]:
# Workspace environment variables
import os
import re
print("Recording workspace environment variables:")
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
ws_bucket_name = re.sub('^gs://', '', ws_bucket)
print(f"Workspace name = {ws_name}")
print(f"Workspace project = {ws_project}")
print(f"Workspace bucket = {ws_bucket}")
print(f"Workspace bucket name = {ws_bucket_name}")

# Copy latest version of the pipeline notebooks to the cloud environment (uncomment if any notebooks have changed since last run)
# print("\nCopying latest pipeline notebooks to the cloud environment:")
# !gsutil -m cp $ws_bucket/notebooks/*.ipynb .

# Additional imports
print("\nRunning imports:")
import import_ipynb
import pandas as pd
from firecloud import api as fapi
import data_repo_client
import ingest_pipeline_utilities as utils
import build_mapping_query as bmq
from google.cloud import storage
from google.cloud import bigquery
import google.auth
import google.auth.transport.requests
import logging
import datetime
import json
import sys
from time import sleep
import requests
from io import BytesIO
import pyarrow.parquet as pq
from azure.storage.blob import BlobClient, ContainerClient

# Common pipeline variables (AnVIL)
ws_attributes = utils.get_workspace_attributes(ws_project, ws_name)
params = {}
params["ws_name"] = ws_name
params["ws_project"] = ws_project
params["ws_bucket"] = ws_bucket
params["ws_bucket_name"] = ws_bucket_name
params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61" 
params["google_project"] = ws_attributes["googleProject"]
params["create_file_table"] = True
params["file_table_name"] = "file_inventory"
params["ingest_user_to_add"] = "tdr_sa"  # tdr_sa or anvil_tdr_ingest
params["global_file_exclusions"] = ["SubsetHailJointCall", ".vds/", "ingest_ignore"]
params["max_combined_rec_ref_size"] = 40000

# Configure logging format
while logging.root.handlers:
    logging.root.removeHandler(logging.root.handlers[-1])
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s", datefmt="%m/%d/%Y %I:%M:%S %p", level=logging.INFO, handlers=[logging.StreamHandler(sys.stdout)])
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)


Recording workspace environment variables:
Workspace name = anvil_workspace_ingest_resources_dev
Workspace project = dsp-data-ingest
Workspace bucket = gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03
Workspace bucket name = fc-2a9eefc3-0302-427f-9ac3-82f078741c03

Running imports:
importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.45: 10/18/2024 2:09pm - Nate Calvanese - Fixed performance bug with find_and_add_fileref_fields function.
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
importing Jupyter notebook from build_file_inventory.ipynb
Version 2.0.4: 10/18/2024 2:19pm - Nate Calvanese - Updated get_objects_list function to not use fuzzy matching for full file paths
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.10: 1/12/2024 11:25am - Nate Calvanese - Made max_combined_rec_ref_size configurable
importing Jupyter notebook from build_mapping_q

# "EL" Pipeline: Load Dataset to TDR in Source Format

## Pipeline Run Variables

In [2]:
## >>> Run Variables <<<
# For datasets split across multiple workspaces, set the staging area and target TDR dataset to the 
# same value to collect all of the source data and process it together.
workspace_run_list = [
    #["Workspace_Name", "Workspace_Project", Public (True/False), "Staging Area (Leave empty for default)", "Target_TDR_Dataset_Name (Leave empty for default)", Run (True/False)]
#     ["ANVIL_Workspace_1", "anvil-datastorage", False, "", "", False],
#     ["ANVIL_Workspace_2", "anvil-datastorage", False, "", "", False],
    ['AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS', 'anvil-datastorage', False, '', '', True],
    ['AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES', 'anvil-datastorage', False, '', '', True],
]
params["skip_source_files_creation"] = False
params["skip_file_inventory_creation"] = False
params["skip_table_data_processing"] = False
params["skip_ingests"] = False
params["trunc_before_ingest"] = True
params["skip_snapshot_creation"] = True
params["snapshot_readers_list"] = ["auth-domain"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)


## >>> File Inventory Variables <<<
# The GCS bucket associated with the source workspace will be automatically included in the file inventory build. To specify 
# additional GCS buckets to include in the file inventory build, add entries to the below dictionary.
params["additional_file_inventory_sources"] = {}
# EXAMPLE:
# params["additional_file_inventory_sources"] = {
#     "staging_area": {
#         "bucket_name": {
#             "include_dirs": [], # Leave empty to include all directories in bucket
#             "exclude_dirs": [] # Exclusions will take precedence over inclusions
#         }
#     }
# }


## >>> Ingest Variables <<<
# For cases where you only want to ingest a subset of files, use the below dictionary to specify exactly what should be ingested.
params["ingest_list_override"] = {
}
# EXAMPLE:
# params["ingest_list_override"] = {
#     "ws_table": ["ws_table_0.json"], # Leave empty to run ingest for every file for target table
# }


## >>> File Reference Variables <<<
# Fields containing GCS links will be identified automatically by the pipeline. The below dict should contain any fields
# that contain file references that aren't proper GCS links in the workspace tables.
data_file_refs_dict = {   
}
# Definitions:
#    Required Fields: column, method, mode, create_new field
#    Optional Fields: match_multiple_files (default to True), match_regex (default to None), match_type (default to 'partial'), new_field_name (default to None)
#    Methods: 
#       file_path_match -- Field contains a full or partial file path, which can be matched to the file inventory to grab the file(s) referenced 
#       tdr_file_id -- Field contains file UUIDs of files already ingested into the target TDR dataset
#    Modes:
#       fileref_in_line -- Populates the field with a file reference object
#       fileref_table_ref -- Populates the field with an ID that joins to a file table. If no file table built, falls back on fileref_in_line logic.
    
#-----------------------------------------------------------------------------------------------------------#
    
# Print variables
print("Pipeline run variables set:")
print("Profile ID: " + params["profile_id"])
print("Ingests to run: ")
current_datetime = datetime.datetime.now()
current_date_string = current_datetime.strftime("%Y%m%d")
for workspace in workspace_run_list:
    if workspace[5] == True:
        ws_attributes = utils.get_workspace_attributes(workspace[1], workspace[0])
        params["phs_id"] = utils.format_phs_id(ws_attributes["attributes"]["phs_id"]) if ws_attributes["attributes"].get("phs_id") else ""
        auth_list = ws_attributes["authorizationDomain"] if ws_attributes.get("authorizationDomain") else []
        params["auth_domains"] = [x["membersGroupName"] for x in auth_list]
        params["consent_name"] = ws_attributes["attributes"]["library:dataUseRestriction"] if ws_attributes["attributes"].get("library:dataUseRestriction") else ""
        if not params["consent_name"]:
            ws_tags = ws_attributes["attributes"].get("tag:tags")
            if ws_tags:
                for ws_tag in ws_tags:
                    if "consent_code:" in ws_tag:
                        params["consent_name"] = ws_tag.replace("consent_code:", "").strip()
                        break
        params["data_files_src_bucket"] = ws_attributes["bucketName"] if ws_attributes.get("bucketName") else ""
        params["public_dataset"] = workspace[2]
        workspace[4] = workspace[4] if workspace[4] else utils.format_dataset_name(workspace[0])
        workspace[3] = workspace[3] if workspace[3] else workspace[0]
        print("- Workspace [" + workspace[1] + "/" + workspace[0] + "] to TDR dataset [" + workspace[4] + "] via Staging Area [" + workspace[3] + "]")
        print("\t- PHS ID = " + params["phs_id"])
        print("\t- Consent Short Name = " + params["consent_name"])
        print("\t- Auth Domains = " + str(params["auth_domains"]))
        print("\t- Public Dataset = " + str(params["public_dataset"]))
        print("\t- Data Files Source Bucket = " + params["data_files_src_bucket"])
print("Skip source files creation? " + str(params["skip_source_files_creation"]))
print("Skip file inventory creation? " + str(params["skip_file_inventory_creation"]))
print("Skip table data processing? " + str(params["skip_table_data_processing"]))
print("Skip ingests? " + str(params["skip_ingests"]))
print("Truncate tables before ingest? " + str(params["trunc_before_ingest"]))
print("Ingest override list: " + str(params["ingest_list_override"]))
print("Skip snapshot creation? " + str(params["skip_snapshot_creation"]))


Pipeline run variables set:
Profile ID: e0e03e48-5b96-45ec-baa4-8cc1ebf74c61
Ingests to run: 
- Workspace [anvil-datastorage/AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS] to TDR dataset [ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS_20241023] via Staging Area [AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS]
	- PHS ID = phs001569
	- Consent Short Name = GRU
	- Auth Domains = ['AUTH_ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS']
	- Public Dataset = False
	- Data Files Source Bucket = fc-secure-cbf1f8fb-8185-46c9-9034-63073cbe7be7
- Workspace [anvil-datastorage/AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES] to TDR dataset [ANVIL_CCDG_Broad_CVD_PROMIS_GRU_WES_20241023] via Staging Area [AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES]
	- PHS ID = phs001569
	- Consent Short Name = GRU
	- Auth Domains = ['AUTH_AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES']
	- Public Dataset = False
	- Data Files Source Bucket = fc-secure-d8de1fe3-972d-480f-a8a8-2bbc251add30
Skip source files creation? False
Skip file inventory creation? False
Skip table data processing

## Pipeline Execution

In [3]:
# Loop through and execute workspace connector pipeline ("E") for listed workspaces
if params["skip_source_files_creation"] == True:
    logging.info("Skipping source file creation, per user request.")
else:
    for workspace in workspace_run_list:
        if workspace[5] == True:
            params["data_file_refs"] = data_file_refs_dict  
            utils.run_ws_connector_pipeline(workspace, params)

# Aggregate staging area to target dataset combinations, loop through them, and execute ingest pipeline ("L")
pipeline_run_list = []
for workspace in workspace_run_list:
    if workspace[5] == True:
        temp_list = [workspace[3], workspace[4], workspace[2]]
        if temp_list not in pipeline_run_list:
            pipeline_run_list.append(temp_list)
for pipeline in pipeline_run_list:
    utils.run_el_pipeline(pipeline, params)


10/23/2024 01:24:40 PM - INFO: Starting Workspace Connector Pipeline for AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS.
10/23/2024 01:24:40 PM - INFO: Creating or updating provenance.json file for Staging Area: AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS
10/23/2024 01:24:41 PM - INFO: Unable to retrieve provenance.json file. Creating new provenance.json file.
10/23/2024 01:24:44 PM - INFO: Additional file reference fields found and marked for processing: qc_result_sample.cram, sample.crai_path, sample.md5_path, sample.cram_path
10/23/2024 01:24:57 PM - INFO: Running source files creation.
10/23/2024 01:24:57 PM - INFO: List of entity tables in current workspace: participant, subject, qc_result_sample, sample_set, sample
10/23/2024 01:24:57 PM - INFO: Starting download of tsv file for participant table.
10/23/2024 01:24:58 PM - INFO: Copying participant_AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS.tsv to gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/input/AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS

Unnamed: 0,entity_type,tsv_file_count,data_model_count,record_count_validation
0,participant,1136,1136,Passed
1,subject,1136,1136,Passed
2,qc_result_sample,1135,1135,Passed
3,sample_set,1,1,Passed
4,sample,1136,1136,Passed


10/23/2024 01:25:23 PM - INFO: The Workspace Connector Pipeline has completed for AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS.
10/23/2024 01:25:31 PM - INFO: Pipeline Results:


Unnamed: 0,Workspace,Staging Area,Time,Step,Status,Message
0,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 13:24:57,Create or Update Staging Area Provenance,Success,"{""phs_id"": ""phs001569"", ""consent_name"": ""GRU"", ""source_workspaces"": [""AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS""], ""auth_domains"": [""AUTH_ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS""], ""data_files_src_buckets"": {""fc-secure-cbf1f8fb-8185-46c9-9034-63073cbe7be7"": {""include_dirs"": [], ""exclude_dirs"": []}}, ""data_file_refs"": {""qc_result_sample"": [{""column"": ""cram"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}], ""sample"": [{""column"": ""crai_path"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}, {""column"": ""md5_path"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}, {""column"": ""cram_path"", ""method"": ""fil"
1,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 13:25:23,Create Source Files,Success,"[{""entity_type"": ""participant"", ""tsv_file_count"": ""1136"", ""data_model_count"": 1136, ""record_count_validation"": ""Passed""}, {""entity_type"": ""subject"", ""tsv_file_count"": ""1136"", ""data_model_count"": 1136, ""record_count_validation"": ""Passed""}, {""entity_type"": ""qc_result_sample"", ""tsv_file_count"": ""1135"", ""data_model_count"": 1135, ""record_count_validation"": ""Passed""}, {""entity_type"": ""sample_set"", ""tsv_file_count"": ""1"", ""data_model_count"": 1, ""record_count_validation"": ""Passed""}, {""entity_type"": ""sample"", ""tsv_file_count"": ""1136"", ""data_model_count"": 1136, ""record_count_validation"": ""Passed""}]"


10/23/2024 01:25:32 PM - INFO: Starting Workspace Connector Pipeline for AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES.
10/23/2024 01:25:32 PM - INFO: Creating or updating provenance.json file for Staging Area: AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES
10/23/2024 01:25:32 PM - INFO: Existing provenance.json file found. Updating with new information.
10/23/2024 01:26:07 PM - INFO: Running source files creation.
10/23/2024 01:26:07 PM - INFO: List of entity tables in current workspace: participant, sample, sample_set, subject
10/23/2024 01:26:07 PM - INFO: Starting download of tsv file for participant table.
10/23/2024 01:26:08 PM - INFO: Copying participant_AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES.tsv to gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/input/AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES/table_data/participant
10/23/2024 01:26:12 PM - INFO: Starting download of tsv file for sample table.
10/23/2024 01:26:36 PM - INFO: Copying sample_AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES.tsv to gs://fc-2a9eef

Unnamed: 0,entity_type,tsv_file_count,data_model_count,record_count_validation
0,participant,16808,16808,Passed
1,sample,16808,16808,Passed
2,sample_set,1,1,Passed
3,subject,16808,16808,Passed


10/23/2024 01:27:01 PM - INFO: The Workspace Connector Pipeline has completed for AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES.
10/23/2024 01:27:09 PM - INFO: Pipeline Results:


Unnamed: 0,Workspace,Staging Area,Time,Step,Status,Message
0,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 13:26:07,Create or Update Staging Area Provenance,Success,"{""phs_id"": ""phs001569"", ""consent_name"": ""GRU"", ""source_workspaces"": [""AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES""], ""auth_domains"": [""AUTH_AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES""], ""data_files_src_buckets"": {""fc-secure-d8de1fe3-972d-480f-a8a8-2bbc251add30"": {""include_dirs"": [], ""exclude_dirs"": []}}, ""data_file_refs"": {""qc_result_sample"": [{""column"": ""cram"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}], ""sample"": [{""column"": ""crai_path"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}, {""column"": ""md5_path"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}, {""column"": ""cram_path"", ""method"": ""file_pa"
1,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 13:27:01,Create Source Files,Success,"[{""entity_type"": ""participant"", ""tsv_file_count"": ""16808"", ""data_model_count"": 16808, ""record_count_validation"": ""Passed""}, {""entity_type"": ""sample"", ""tsv_file_count"": ""16808"", ""data_model_count"": 16808, ""record_count_validation"": ""Passed""}, {""entity_type"": ""sample_set"", ""tsv_file_count"": ""1"", ""data_model_count"": 1, ""record_count_validation"": ""Passed""}, {""entity_type"": ""subject"", ""tsv_file_count"": ""16808"", ""data_model_count"": 16808, ""record_count_validation"": ""Passed""}]"


10/23/2024 01:27:09 PM - INFO: Starting Extract and Load (EL) Pipeline for AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS.
10/23/2024 01:27:10 PM - INFO: Building file inventory.
10/23/2024 01:29:15 PM - INFO: Recording inventory entries from fc-secure-cbf1f8fb-8185-46c9-9034-63073cbe7be7 (975003 objects total)
10/23/2024 01:54:15 PM - INFO: 97500 files recorded (~10%)
10/23/2024 01:55:19 PM - INFO: 195000 files recorded (~20%)
10/23/2024 01:56:22 PM - INFO: 292500 files recorded (~30%)
10/23/2024 01:57:30 PM - INFO: 390000 files recorded (~40%)
10/23/2024 01:58:34 PM - INFO: 487500 files recorded (~50%)
10/23/2024 01:59:39 PM - INFO: 585000 files recorded (~60%)
10/23/2024 02:00:43 PM - INFO: 682500 files recorded (~70%)
10/23/2024 02:01:48 PM - INFO: 780000 files recorded (~80%)
10/23/2024 02:02:51 PM - INFO: 877500 files recorded (~90%)
10/23/2024 02:03:57 PM - INFO: 975000 files recorded (~100%)
10/23/2024 02:03:57 PM - INFO: All inventory entries recorded (975003 objects total).
10/23/2024

10/23/2024 02:16:26 PM - INFO: Running ingests for target table: sample
10/23/2024 02:16:26 PM - INFO: Checking for file ingest_pipeline/output/source/AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS/table_data/sample/sample_0.json...
10/23/2024 02:16:26 PM - INFO: Running ingest from sample_0.json to table sample.
TDR Job ID: SIqy24JyTQKASlhTXixIKw
10/23/2024 02:19:09 PM - INFO: Ingest from file sample_0.json succeeded: Job succeeded, but error retrieving job result: (500)
Reason: Internal Server Error
HTTP response headers: HTTPHeaderDict({'Date': 'Wed, 23 Oct 2024 14:18:58 GMT', 'Server': 'Apache', 'X-Frame-Options': 'SAMEORIGIN', 'Access-Control-Allow-Headers': 'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization,Accept,Referer,X-App-Id,Origin', 'Access-Control-Allow-Methods': 'GET,POST,DELETE,PUT,PATCH,OPTIONS,HEAD', 'X-Content-Type-Options': 'nosniff', 'Strict-Transport-Security': 'max-age=31536000;includeSubDomains', 'Cache-Control': 'no-cache,n

Unnamed: 0,Staging Area,Time,Step,Task,Status,Message
0,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 13:27:10,Initialization,Provenance File Retrieval,Success,"{""phs_id"": ""phs001569"", ""consent_name"": ""GRU"", ""source_workspaces"": [""AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS""], ""auth_domains"": [""AUTH_ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS""], ""data_files_src_buckets"": {""fc-secure-cbf1f8fb-8185-46c9-9034-63073cbe7be7"": {""include_dirs"": [], ""exclude_dirs"": []}}, ""data_file_refs"": {""qc_result_sample"": [{""column"": ""cram"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}], ""sample"": [{""column"": ""crai_path"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}, {""column"": ""md5_path"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}, {""column"": ""cram_path"", ""method"": ""fil"
1,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 14:04:03,File Inventory Creation,Build File Inventory,Success,3408 files found
2,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 14:04:40,Table Data Processing,Ingest Pre-Processing,Success,"{""file_inventory_population"": ""File inventory populated"", ""participant"": ""No errors raised"", ""qc_result_sample"": ""No errors raised"", ""sample"": ""No errors raised"", ""sample_set"": ""No errors raised"", ""subject"": ""No errors raised"", ""workspace_attributes"": ""No errors raised"", ""file_inventory"": ""No errors raised""}"
3,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 14:04:41,Dataset Creation or Retrieval,Enumerate Datasets,Success,0 datasets found. Matching dataset_id =
4,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 14:05:42,Dataset Creation or Retrieval,Create New Dataset,Success,"Job_ID: B75adRljT92kw_bc4osCQQ - Truncated Response: {'id': 'ecd0e3b1-a177-4487-8e33-0084688cf148', 'name': 'ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS_20241023', 'description': 'TDR Dataset for AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS', 'defaultProfileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'createdDate': '2024-10-23T14:05:21.370245Z', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': True, 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-561d6a93', 'storageAccount': None, 'phsId': 'phs001569', 'selfHosted': True, 'predictableFileIds': True, 'tags': [], 'resourceLocks': {'exclusive': None, 'shared': []}}"
5,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 14:06:35,Dataset Service Account Setup,Add SA to Anvil Ingest Group,Success,tdr-ingest-sa@datarepo-561d6a93.iam.gserviceaccount.com
6,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 14:06:44,Dataset Service Account Setup,Add Ingest User to Workspace anvil_workspace_ingest_resources_dev,Success,tdr-ingest-sa@datarepo-561d6a93.iam.gserviceaccount.com
7,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 14:06:50,Dataset Service Account Setup,Add Ingest User to Workspace AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,Success,tdr-ingest-sa@datarepo-561d6a93.iam.gserviceaccount.com
8,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 14:14:34,Dataset Ingests,Table: participant - File: participant_0.json,Success,"Job_ID: R3Snd0DVRyyhsdpCUg3Weg - Truncated Response: {'dataset_id': 'ecd0e3b1-a177-4487-8e33-0084688cf148', 'dataset': 'ANVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS_20241023', 'table': 'participant', 'path': 'gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/source/AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS/table_data/participant/participant_0.json', 'load_tag': 'Ingest for AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS', 'row_count': 1136, 'bad_row_count': 0, 'load_result': None}"
9,AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS,2024-10-23 14:16:26,Dataset Ingests,Table: qc_result_sample - File: qc_result_sample_0.json,Success,"Job_ID: thlRcNzFSXm-nYZvXsJk6Q - Truncated Response: Job succeeded, but error retrieving job result: (500)\nReason: Internal Server Error\nHTTP response headers: HTTPHeaderDict({'Date': 'Wed, 23 Oct 2024 14:16:16 GMT', 'Server': 'Apache', 'X-Frame-Options': 'SAMEORIGIN', 'Access-Control-Allow-Headers': 'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization,Accept,Referer,X-App-Id,Origin', 'Access-Control-Allow-Methods': 'GET,POST,DELETE,PUT,PATCH,OPTIONS,HEAD', 'X-Content-Type-Options': 'nosniff', 'Strict-Transport-Security': 'max-age=31536000;includeSubDomains', 'Cache-Control': 'no-cache,no-store,must-revalidate', 'X-Request-ID': 'eZAx8wxG', 'Content-Type': 'application/json', 'Content-Length': '368191', 'Vary': 'Origin', 'Via': '1.1 google', 'Alt-Svc': 'h3="":443""; ma=2592000,h3-29="":443""; ma=2592000'})\nHTTP response body: {""message"":""Failed to deserialize value '[\""bio.terra.model.IngestResponseModel\"",{\""dataset_id\"":\""ecd0e3b1-a177-4487-8e33-0084688cf148\"",\""dataset\"":\""ANVIL_CCDG_Broad_CVD_E"


10/23/2024 02:23:02 PM - INFO: Starting Extract and Load (EL) Pipeline for AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES.
10/23/2024 02:23:02 PM - INFO: Building file inventory.
10/23/2024 02:23:15 PM - INFO: Recording inventory entries from fc-secure-d8de1fe3-972d-480f-a8a8-2bbc251add30 (97744 objects total)
10/23/2024 02:25:49 PM - INFO: 9774 files recorded (~10%)
10/23/2024 02:25:55 PM - INFO: 19548 files recorded (~20%)
10/23/2024 02:26:01 PM - INFO: 29322 files recorded (~30%)
10/23/2024 02:26:08 PM - INFO: 39096 files recorded (~40%)
10/23/2024 02:26:14 PM - INFO: 48870 files recorded (~50%)
10/23/2024 02:26:20 PM - INFO: 58644 files recorded (~60%)
10/23/2024 02:26:27 PM - INFO: 68418 files recorded (~70%)
10/23/2024 02:26:33 PM - INFO: 78192 files recorded (~80%)
10/23/2024 02:26:39 PM - INFO: 87966 files recorded (~90%)
10/23/2024 02:26:46 PM - INFO: 97740 files recorded (~100%)
10/23/2024 02:26:46 PM - INFO: All inventory entries recorded (97744 objects total).
10/23/2024 02:26:46 PM -



10/23/2024 02:27:19 PM - INFO: Processing files for target table: sample_set.
10/23/2024 02:27:23 PM - INFO: Processing files for target table: subject.
10/23/2024 02:27:28 PM - INFO: Processing files for target table: workspace_attributes.
10/23/2024 02:27:31 PM - INFO: Processing files for target table: file_inventory.
10/23/2024 02:27:46 PM - INFO: Creating schema object and copying to cloud storage.
10/23/2024 02:27:50 PM - INFO: File processing complete. Status: Success. Details: {"file_inventory_population": "File inventory populated", "participant": "No errors raised", "sample": "No errors raised", "sample_set": "No errors raised", "subject": "No errors raised", "workspace_attributes": "No errors raised", "file_inventory": "No errors raised"}. Tables to ingest: participant, sample, sample_set, subject, workspace_attributes, file_inventory
10/23/2024 02:27:51 PM - INFO: Attempting to create or retrieve the specified TDR dataset.
10/23/2024 02:27:51 PM - INFO: Creating new dataset

10/23/2024 03:05:07 PM - INFO: Running ingests for target table: workspace_attributes
10/23/2024 03:05:07 PM - INFO: Checking for file ingest_pipeline/output/source/AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES/table_data/workspace_attributes/workspace_attributes_0.json...
10/23/2024 03:05:07 PM - INFO: Running ingest from workspace_attributes_0.json to table workspace_attributes.
TDR Job ID: U45B1CqxQSabV4-LURDPzA
10/23/2024 03:05:28 PM - INFO: Ingest from file workspace_attributes_0.json succeeded: {'dataset_id': 'b2b217c2-4b68-4820-bf9d-e2927bfe8706', 'dataset': 'ANVIL_CCDG_Broad_CVD_PROMIS_GRU_WES_20241023', 'table': 'workspace_attributes', 'path': 'gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/source/AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES/table_data/workspace_attributes/workspace_attributes_0.json', 'load_tag': 'Ingest for AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES', 'row_count': 42, 'bad_row_count': 0, 'load_result': None}
10/23/2024 03:05:28 PM - INFO: Running ingests for targ

Unnamed: 0,Staging Area,Time,Step,Task,Status,Message
0,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 14:23:02,Initialization,Provenance File Retrieval,Success,"{""phs_id"": ""phs001569"", ""consent_name"": ""GRU"", ""source_workspaces"": [""AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES""], ""auth_domains"": [""AUTH_AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES""], ""data_files_src_buckets"": {""fc-secure-d8de1fe3-972d-480f-a8a8-2bbc251add30"": {""include_dirs"": [], ""exclude_dirs"": []}}, ""data_file_refs"": {""qc_result_sample"": [{""column"": ""cram"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}], ""sample"": [{""column"": ""crai_path"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}, {""column"": ""md5_path"", ""method"": ""file_path_match"", ""match_multiple_files"": false, ""match_regex"": null, ""match_type"": ""exact"", ""mode"": ""fileref_in_line"", ""create_new_field"": false, ""new_field_name"": null}, {""column"": ""cram_path"", ""method"": ""file_pa"
1,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 14:26:54,File Inventory Creation,Build File Inventory,Success,54173 files found
2,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 14:27:51,Table Data Processing,Ingest Pre-Processing,Success,"{""file_inventory_population"": ""File inventory populated"", ""participant"": ""No errors raised"", ""sample"": ""No errors raised"", ""sample_set"": ""No errors raised"", ""subject"": ""No errors raised"", ""workspace_attributes"": ""No errors raised"", ""file_inventory"": ""No errors raised""}"
3,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 14:27:51,Dataset Creation or Retrieval,Enumerate Datasets,Success,0 datasets found. Matching dataset_id =
4,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 14:28:52,Dataset Creation or Retrieval,Create New Dataset,Success,"Job_ID: Qs-xnebgS0-4fqnT2_Sivw - Truncated Response: {'id': 'b2b217c2-4b68-4820-bf9d-e2927bfe8706', 'name': 'ANVIL_CCDG_Broad_CVD_PROMIS_GRU_WES_20241023', 'description': 'TDR Dataset for AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES', 'defaultProfileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'createdDate': '2024-10-23T14:28:32.823376Z', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': True, 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-87d2134c', 'storageAccount': None, 'phsId': 'phs001569', 'selfHosted': True, 'predictableFileIds': True, 'tags': [], 'resourceLocks': {'exclusive': None, 'shared': []}}"
5,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 14:29:47,Dataset Service Account Setup,Add SA to Anvil Ingest Group,Success,tdr-ingest-sa@datarepo-87d2134c.iam.gserviceaccount.com
6,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 14:29:57,Dataset Service Account Setup,Add Ingest User to Workspace anvil_workspace_ingest_resources_dev,Success,tdr-ingest-sa@datarepo-87d2134c.iam.gserviceaccount.com
7,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 14:30:04,Dataset Service Account Setup,Add Ingest User to Workspace AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,Success,tdr-ingest-sa@datarepo-87d2134c.iam.gserviceaccount.com
8,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 14:37:46,Dataset Ingests,Table: participant - File: participant_0.json,Success,"Job_ID: FSFzp5JBQuiLnn5XodqAxg - Truncated Response: {'dataset_id': 'b2b217c2-4b68-4820-bf9d-e2927bfe8706', 'dataset': 'ANVIL_CCDG_Broad_CVD_PROMIS_GRU_WES_20241023', 'table': 'participant', 'path': 'gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/source/AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES/table_data/participant/participant_0.json', 'load_tag': 'Ingest for AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES', 'row_count': 16808, 'bad_row_count': 0, 'load_result': None}"
9,AnVIL_CCDG_Broad_CVD_PROMIS_GRU_WES,2024-10-23 14:51:06,Dataset Ingests,Table: sample - File: sample_0.json,Success,"Job_ID: P85e7vzNSpKvc9G76AFaBw - Truncated Response: Job succeeded, but error retrieving job result: (500)\nReason: Internal Server Error\nHTTP response headers: HTTPHeaderDict({'Date': 'Wed, 23 Oct 2024 14:50:56 GMT', 'Server': 'Apache', 'X-Frame-Options': 'SAMEORIGIN', 'Access-Control-Allow-Headers': 'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization,Accept,Referer,X-App-Id,Origin', 'Access-Control-Allow-Methods': 'GET,POST,DELETE,PUT,PATCH,OPTIONS,HEAD', 'X-Content-Type-Options': 'nosniff', 'Strict-Transport-Security': 'max-age=31536000;includeSubDomains', 'Cache-Control': 'no-cache,no-store,must-revalidate', 'X-Request-ID': 'ekMJGJd6', 'Content-Type': 'application/json', 'Content-Length': '378107', 'Vary': 'Origin', 'Via': '1.1 google', 'Alt-Svc': 'h3="":443""; ma=2592000,h3-29="":443""; ma=2592000'})\nHTTP response body: {""message"":""Failed to deserialize value '[\""bio.terra.model.IngestResponseModel\"",{\""dataset_id\"":\""b2b217c2-4b68-4820-bf9d-e2927bfe8706\"",\""dataset\"":\""ANVIL_CCDG_Broad_CVD_P"


# Mapping Development
Work through the following steps for each dataset that needs to be processed through the transformation pipeline in Step 4, specifying the target schema ("mapping target") and mapping specification ("mapping_target_spec") you would like to use for transformation. Note that you can use the logs or results_dict from the previous step to retrieve the dataset_id values of interest, or retrieve them directly from TDR via the UI or Swagger.

## Dataset Mapping Variables

In [8]:
## >>> Mapping Variables <<<
# For each dataset specified, include an appropriate mapping target and mapping target specification
datasets_to_map_list = [
    #["dataset_id", "mapping_target", "mapping_target_spec", Run (True/False)]
    ['cb7dccc5-171c-48bf-9e5e-07bd6f52b34a', 'anvil', 'depmap_1', True],
    ['ac48514d-0b01-4a92-b164-821fa3e05d7a', 'anvil', 'hudsonalpha_1', True],
]

#-----------------------------------------------------------------------------------------------------------#
    
# Print variables
print("Datasets to map: ")
api_client = utils.refresh_tdr_api_client()
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
final_datasets_to_map_dict = {}
skip_dataset_list_access = []
skip_dataset_list_mapping = []
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
for dataset in datasets_to_map_list:
    if dataset[3]:
        dataset_id = dataset[0]
        mapping_target = dataset[1]
        mapping_target_spec = dataset[2]
        try:
            dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            dataset_name = dataset_info["name"]
        except:
            dataset_name = ""
            skip_dataset_list_access.append(dataset_id)
        try:
            blob = bucket.blob("ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target))
            content = json.loads(blob.download_as_string(client=None))
            blob = bucket.blob("ingest_pipeline/mapping/{}/{}/mapping_specification.json".format(mapping_target, mapping_target_spec))
            content = json.loads(blob.download_as_string(client=None))
        except:
            skip_dataset_list_mapping.append(dataset_id)
        if dataset_id not in skip_dataset_list_access and dataset_id not in skip_dataset_list_mapping:
            final_datasets_to_map_dict[dataset_id] = {}
            final_datasets_to_map_dict[dataset_id]["mapping_target"] = mapping_target 
            final_datasets_to_map_dict[dataset_id]["mapping_target_spec"] = mapping_target_spec
            print("\t- " + dataset_name + " ({})".format(dataset_id) + " with {}/{}".format(mapping_target, mapping_target_spec))
if skip_dataset_list_access:
    print("Datasets to skip due to non-existence or inaccessibility to the current user:")
    print("\t- " + "\n\t- ".join(skip_dataset_list_access))
if skip_dataset_list_mapping:
    print("Datasets to skip due to invalid mapping target or mapping target specification:")
    print("\t- " + "\n\t- ".join(skip_dataset_list_mapping))   


Datasets to map: 
	- ANVIL_DepMap_HMB_20240827 (cb7dccc5-171c-48bf-9e5e-07bd6f52b34a) with anvil/depmap_1
	- ANVIL_HudsonAlpha_LR_v1_GRU_20241018 (ac48514d-0b01-4a92-b164-821fa3e05d7a) with anvil/hudsonalpha_1


## Add Missing Relationships to TDR Dataset Schema
Relationships are needed by the mapping query constructor to build appropriate joins between tables. If no joins are required between tables, this step is unnecessary. 

In [9]:
# Record relationships to potentially add to the source datasets. Note that there may be more relationships to add
# than those listed below, so add to this list as necessary.
potential_relationships = [
    ["subject.family_id", "family.family_id"],
    ["sample.subject_id", "subject.subject_id"],
    ["sample.t_01_subject_id", "subject.subject_id"],
    ["sequencing.sample_id", "sample.sample_id"],
    ["sequencing.sample", "sample.sample_id"],
    ["sequencing.sample_alias", "sample.sample_id"],
    ["sample.participant", "participant.participant_id"],
    ["sample.participant_id", "participant.participant_id"],
    ["discovery.sample_id", "sample.sample_id"],
    ["discovery.subject_id", "subject.subject_id"],
    ["qc_result_sample.qc_result_sample_id", "sample.sample_id"],
    ["interval.chromosome", "chromosome.chromosome_id"],
    ["analyte.participant_id", "participant.participant_id"],
    ["participant.family_id", "family.family_id"],
    ["phenotype.participant_id", "participant.participant_id"],
    ["experiment_rna_short_read.analyte_id", "analyte.analyte_id"],
    ["experiment_dna_short_read.analyte_id", "analyte.analyte_id"],
    ["aligned_rna_short_read.experiment_rna_short_read_id", "experiment_rna_short_read.experiment_rna_short_read_id"],
    ["aligned_dna_short_read.experiment_dna_short_read_id", "experiment_dna_short_read.experiment_dna_short_read_id"],
    ["aligned_dna_short_read_set.aligned_dna_short_reads", "aligned_dna_short_read.aligned_dna_short_read_id"],
    ["called_variants_dna_short_read.aligned_dna_short_read_set_id", "aligned_dna_short_read_set.aligned_dna_short_read_set_id"],
    ["biosample.donor_id", "donor.donor_id"],
]

# Loop through datasets and process potential relationship additions
results = []
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
for dataset_id in final_datasets_to_map_dict:
    print("Processing potential relationships for dataset_id = {}".format(dataset_id))
    
    # Retrieve source schema
    src_schema_dict = {}
    try:
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))
        results.append([dataset_id, "Error"])

    # Loop through potential relationships and add those present for the source dataset
    additional_relationships = []
    for rel in potential_relationships:
        from_table = rel[0].split(".")[0] 
        from_column = rel[0].split(".")[1]
        to_table = rel[1].split(".")[0]
        to_column = rel[1].split(".")[1]
        if bmq.confirm_column_exists(src_schema_dict, from_table, from_column) and bmq.confirm_column_exists(src_schema_dict, to_table, to_column):
            relationship_found = False
            for rel_entry in src_schema_dict["relationships"]:
                if rel_entry["_from"]["table"] == from_table and rel_entry["_from"]["column"] == from_column and rel_entry["to"]["table"] == to_table and rel_entry["to"]["column"] == to_column:
                    relationship_found = True
                elif rel_entry["_from"]["table"] == to_table and rel_entry["_from"]["column"] == to_column and rel_entry["to"]["table"] == from_table and rel_entry["to"]["column"] == from_column:
                    relationship_found = True
            if not relationship_found:
                rel_dict = {
                    "name": from_table + "_" + from_column + "__to__" + to_table + "_" + to_column,
                    "from": {"table": from_table, "column": from_column},
                    "to": {"table": to_table, "column": to_column}
                }
                additional_relationships.append(rel_dict)

    # Submit the schema update request for the TDR dataset
    if additional_relationships:
        schema_update_request = {
            "description": "Adding relationships to support query construction.",
            "changes": {
                "addRelationships": additional_relationships
            }
        }
        try:
            resp = utils.wait_for_tdr_job(datasets_api.update_schema(id=dataset_id, dataset_schema_update_model=schema_update_request))
            print("Schema update successful: " + str(resp)[0:1000])
            results.append([dataset_id, "Success"])
        except Exception as e:
            print("Error running schema update: " + str(e))
            results.append([dataset_id, "Error"])
    else:
        print("No additional relationships to add to schema.")
        results.append([dataset_id, "Success"])

print("Processing of potential relationships for specified datasets complete.")
print("\nResults:")
results_df = pd.DataFrame(results, columns = ["dataset", "status"])
display(results_df)


Processing potential relationships for dataset_id = cb7dccc5-171c-48bf-9e5e-07bd6f52b34a
TDR Job ID: m2xayXLfQg2MQ3nGsft20g
Schema update successful: ({'id': 'cb7dccc5-171c-48bf-9e5e-07bd6f52b34a', 'name': 'ANVIL_DepMap_HMB_20240827', 'description': 'TDR Dataset for AnVIL_DepMap_HMB', 'defaultProfileId': None, 'dataProject': None, 'defaultSnapshotId': None, 'schema': {'tables': [{'name': 'workspace_attributes', 'columns': [{'name': 'attribute', 'datatype': 'string', 'array_of': False, 'required': False}, {'name': 'value', 'datatype': 'string', 'array_of': False, 'required': False}, {'name': 'ingest_provenance', 'datatype': 'string', 'array_of': False, 'required': False}], 'primaryKey': [], 'partitionMode': 'none', 'datePartitionOptions': None, 'intPartitionOptions': None, 'rowCount': None}, {'name': 'biosample', 'columns': [{'name': 'biosample_id', 'datatype': 'string', 'array_of': False, 'required': False}, {'name': 'cell_format', 'datatype': 'string', 'array_of': False, 'required': F

Unnamed: 0,dataset,status
0,cb7dccc5-171c-48bf-9e5e-07bd6f52b34a,Success
1,ac48514d-0b01-4a92-b164-821fa3e05d7a,Success


## Retrieve Mapping Artifacts and Run Query Construction
Retrieve the artifacts you would like to use to construct transformation queries for your datasets, based on the previously specified target schema and mapping specification. These transformation queries will then be dynamically constructed based on the appropriate target schema, mapping specification, and source schema. 

In [None]:
# Loop through datasets and process transformation query construction
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
results = []
for dataset_id in final_datasets_to_map_dict:
    print("Building transformation queries for dataset_id = {}".format(dataset_id))

    # Collect mapping variables
    mapping_target = final_datasets_to_map_dict[dataset_id]["mapping_target"]
    mapping_target_spec = final_datasets_to_map_dict[dataset_id]["mapping_target_spec"]
    
    # Retrieve source schema
    src_schema_dict = {}
    try:
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["name"] = response["name"]
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
        phs_id = response["phs_id"]
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))

    # Set dataset name and project name parameters to substitute into transform queries
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = re.sub("'", "", utils.derive_project_name(dataset_id, phs_id, dataset_name_value))

    # Retrieve target schema and mapping specification
    target_schema_dict = {}
    mapping_spec = {}
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(ws_bucket_name)
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
        target_schema_dict = json.loads(blob.download_as_string(client=None))
    except Exception as e:
        print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
        blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
        blob_string = blob_string.replace("$BQ_DATASET", bq_project + "." + bq_schema)
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

    # Update aliases in mapping specification
    mapping_spec = bmq.update_mapping_spec_aliases(mapping_spec, src_schema_dict)
    
    # Build queries from mapping specification
    query_dict = {}
    if target_schema_dict:
        for target_table in target_schema_dict["tables"]:
            table_name = target_table["name"]
            missing_artifacts = False
            if src_schema_dict and mapping_spec:
                query_dict[table_name] = bmq.build_mapping_query(target_table, src_schema_dict, mapping_spec, bq_project, bq_schema)
            else:
                missing_artifacts = True
                query_dict[table_name] = {"query": "", "syntax_check": ""} 
        if missing_artifacts == True:
            print("Source schema dictionary and/or mapping specification missing. Unable to generate queries.")
            results.append([dataset_id, "Error"])
    else:
        print("Target schema dictionary missing. Unable to generate queries.")
        results.append([dataset_id, "Error"])
    
    # Evaluate queries -- Publish if no issues found, otherwise convert to dataframe and display
    failure_count = 0
    for key, val in query_dict.items():
        if val["syntax_check"] != "Passed" and val["syntax_check"] != None:
            failure_count += 1
    if failure_count == 0:
        print("No failures found in query construction, publishing to the cloud.")
        results.append([dataset_id, "Success"])
        # Copy target schema file to output folder for mapping target
        source_path = "ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target)
        destination_path = "ingest_pipeline/output/transformed/{}/{}/schema/mapping_schema_object.json".format(mapping_target, dataset_id)
        !gsutil cp $ws_bucket/$source_path $ws_bucket/$destination_path 2> stdout

        # Limit query dict to valid queries, write out, and copy to output folder for mapping target
        valid_query_dict = {}
        for target, val in query_dict.items():
            if val["syntax_check"] == "Passed":
                valid_query_dict[target] = val
        final_query_dict = {
            "dataset_id": dataset_id,
            "transforms": valid_query_dict
        }
        query_dict_json = json.dumps(final_query_dict)
        query_output_file = "transform_query_set.json"
        with open(query_output_file, 'w') as outfile:
            outfile.write(query_dict_json)
        destination_path = "ingest_pipeline/output/transformed/{}/{}/queries".format(mapping_target, dataset_id)
        !gsutil cp $query_output_file $ws_bucket/$destination_path/ 2> stdout
    else:
        print("Failures found in query construction, must be resolved before publishing.")
        print("Query building results:")
        results.append([dataset_id, "Error"])
        query_df = pd.DataFrame.from_dict(query_dict, orient="index")
        query_df.index.name = "target_table"
        query_df.reset_index(inplace=True)
        display(query_df)

print("Transformation query construction and processing complete.")
print("\nResults:")
results_df = pd.DataFrame(results, columns = ["dataset", "status"])
display(results_df)


## Evaluate Vocabulary Mapping
For target attributes leveraging the "VOCAB_MAP" transformation, evaluate whether the source values have a record in the dsp-data-ingest.transform_resources.vocab_map table. If additional mappings are needed, these should be put into place before the transformation queries are executed.

In [None]:
# Set display parameter
show_only_missing_maps = True

# Loop through datasets and process vocabulary mapping evaluation
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
for dataset_id in final_datasets_to_map_dict:
    print("Evaluating vocabulary mapping for dataset_id = {}".format(dataset_id))

    # Collect mapping variables
    mapping_target = final_datasets_to_map_dict[dataset_id]["mapping_target"]
    mapping_target_spec = final_datasets_to_map_dict[dataset_id]["mapping_target_spec"]
    
    # Retrieve source schema
    src_schema_dict = {}
    try:
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["name"] = response["name"]
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
        phs_id = response["phs_id"]
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))

    # Set dataset name and project name parameters to substitute into transform queries
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = utils.derive_project_name(dataset_id, phs_id, dataset_name_value)

    # Retrieve target schema and mapping specification
    target_schema_dict = {}
    mapping_spec = {}
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(ws_bucket_name)
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
        target_schema_dict = json.loads(blob.download_as_string(client=None))
    except Exception as e:
        print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
        blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

    # Evaluate vocab mapping and display results
    df = bmq.evaluate_vocab_mapping(mapping_spec, src_schema_dict, target_schema_dict, bq_project, bq_schema)
    print("-------------------------------------------")
    print("Missing mapped_value view:")
    print("-------------------------------------------")
    display(df[df["mapped_value"].isnull() & df["source_value"].notnull()])
    if not show_only_missing_maps:
        print("\n-------------------------------------------")
        print("Full view:")
        print("-------------------------------------------")
        display(df)
    
print("Vocabulary mapping evaluation and processing complete.")


## [Optional] Update/Override Generated Queries as Necessary
Review any queries that have not passed the syntax check, as these need to be remedied before they can be published and executed. Any other queries that do not align with expectations can be overridden by either A) Updating the mapping target specification and re-running the previous step, or B) Manually overriding the query below. Option B should only be used in one-off cases.

### Build Base Query Dictionary

In [None]:
# Input the appropriate dataset and mapping target specification
dataset_id = "f1e1ef01-d52d-423e-a65b-3a1d26c7ee9d"
mapping_target = "anvil"
mapping_target_spec = "cmg_ext_2"

# Retrieve source schema
src_schema_dict = {}
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
try:
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
    src_schema_dict["name"] = response["name"]
    src_schema_dict["tables"] = response["schema"]["tables"]
    src_schema_dict["relationships"] = response["schema"]["relationships"]
    bq_project = response["access_information"]["big_query"]["project_id"]
    bq_schema = response["access_information"]["big_query"]["dataset_name"]
    phs_id = response["phs_id"]
except Exception as e:
    print("Error retrieving source schema from TDR. Error: {}".format(e))

# Set dataset name and project name parameters to substitute into transform queries
dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
project_name_value = utils.derive_project_name(dataset_id, phs_id, dataset_name_value)

# Retrieve target schema and mapping specification
target_schema_dict = {}
mapping_spec = {}
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
    target_schema_dict = json.loads(blob.download_as_string(client=None))
except Exception as e:
    print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
    blob_string = blob.download_as_text(client=None)
    blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
    blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
    mapping_spec = json.loads(blob_string)
except Exception as e:
    print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

# Build queries from mapping specification
query_dict = {}
if target_schema_dict:
    for target_table in target_schema_dict["tables"]:
        table_name = target_table["name"]
        missing_artifacts = False
        if src_schema_dict and mapping_spec:
            query_dict[table_name] = bmq.build_mapping_query(target_table, src_schema_dict, mapping_spec, bq_project, bq_schema)
        else:
            missing_artifacts = True
            query_dict[table_name] = {"query": "", "syntax_check": ""} 
    if missing_artifacts == True:
        print("Source schema dictionary and/or mapping specification missing. Unable to generate queries.")
else:
    print("Target schema dictionary missing. Unable to generate queries.")
    
# Display query dictionary
query_df = pd.DataFrame.from_dict(query_dict, orient="index")
query_df.index.name = "target_table"
query_df.reset_index(inplace=True)
display(query_df)
    


### Update Query Dict as Necessary

In [None]:
# To update the query definition for particular target table, input the target table and query below
target_table = "anvil_donor"
query = "SELECT 1"

# Run syntax check
query_dict[target_table]["query"] = query
query_dict[target_table]["syntax_check"] = bmq.run_syntax_check(query)
print(query_dict[target_table])


### Publish Updated Query Dict

In [None]:
# Copy target schema file to output folder for mapping target
source_path = "ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target)
destination_path = "ingest_pipeline/output/transformed/{}/{}/schema/mapping_schema_object.json".format(mapping_target, dataset_id)
!gsutil cp $ws_bucket/$source_path $ws_bucket/$destination_path 2> stdout

# Limit query dict to valid queries, write out, and copy to output folder for mapping target
valid_query_dict = {}
for target, val in query_dict.items():
    if val["syntax_check"] == "Passed":
        valid_query_dict[target] = val
final_query_dict = {
    "dataset_id": dataset_id,
    "transforms": valid_query_dict
}
query_dict_json = json.dumps(final_query_dict)
query_output_file = "transform_query_set.json"
with open(query_output_file, 'w') as outfile:
    outfile.write(query_dict_json)
destination_path = "ingest_pipeline/output/transformed/{}/{}/queries".format(mapping_target, dataset_id)
!gsutil cp $query_output_file $ws_bucket/$destination_path/ 2> stdout

# "T" Pipeline: Load Additional Transformed Tables to TDR

## Pipeline Run Variables

In [None]:
# Run Variables
dataset_id_run_list = [
    #["dataset_id", Run (True/False)],   
    ['8da05494-fe7a-4af5-b257-bada143ee426', True],
    ['8b098ab4-df02-4619-8ded-657e496695c1', True],
    ['373ff2e8-0f63-4179-a55c-3fe0b85556aa', True],
    ['31e61d00-61cc-46f2-a793-8ea8dfbb0832', True],
]
params["mapping_target"] = "anvil"
params["skip_transforms"] = False
params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names 
params["skip_schema_extension"] = False
params["skip_ingests"] = False
params["trunc_before_ingest"] = True
params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
params["skip_file_relation_inference"] = False
params["skip_dangling_fk_resolution"] = False
params["skip_supplementary_file_identification"] = False
params["skip_snapshot_creation"] = False
params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org", "auth-domain"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
params["skip_data_validation"] = False

#-----------------------------------------------------------------------------------------------------------#

# Print variables
print("Pipeline run variables set:")
print("Profile ID: " + params["profile_id"])
print("Mapping Target: " + params["mapping_target"])
print("Datasets to run: ")
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
dataset_skip_list = []
for dataset in dataset_id_run_list:
    if dataset[1]:
        dataset_id = dataset[0]
        try:
            dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            dataset_name = dataset_info["name"]
            phs_id = dataset_info["phs_id"]
            consent_name = dataset_info["properties"]["consent_name"]
            auth_domains = dataset_info["properties"]["auth_domains"]
            src_workspaces = dataset_info["properties"]["source_workspaces"]
        except:
            dataset_name = ""
            dataset_skip_list.append(dataset_id)
        if dataset_name:
            dataset_id = dataset[0]
            print("- " + dataset_name + " ({})".format(dataset_id))
            print("\t- PHS ID = " + phs_id)
            print("\t- Consent Short Name = " + consent_name)
            print("\t- Auth Domains = " + str(auth_domains))
            print("\t- Source Workspaces = " + str(src_workspaces))
if dataset_skip_list:
    print("Datasets to skip (they either don't exist or aren't accessible to the current user): ")
    print("\t- " + "\n\t- ".join(dataset_skip_list)) 
print("Skip transforms? " + str(params["skip_transforms"]))
print("Transforms override list: " + str(params["transform_list_override"]))
print("Skip schema extension? " + str(params["skip_schema_extension"]))
print("Skip ingests? " + str(params["skip_ingests"]))
print("Truncate tables before ingest? " + str(params["trunc_before_ingest"]))
print("Ingest override list: " + str(params["ingest_list_override"]))
print("Skip file relationship inference? " + str(params["skip_file_relation_inference"]))
print("Skip dangling foreign key resolution? " + str(params["skip_dangling_fk_resolution"]))
print("Skip supplementary file identification? " + str(params["skip_supplementary_file_identification"]))
print("Skip snapshot creation? " + str(params["skip_snapshot_creation"]))
print("Skip data validation? " + str(params["skip_data_validation"]))


## Pipeline Execution

In [None]:
# Loop through and execute pipeline for listed workspaces
for dataset in dataset_id_run_list:
    if dataset[1]:
        dataset_id = dataset[0]
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            dataset_name = dataset_info["name"]
            phs_id = dataset_info["phs_id"]
            consent_name = dataset_info["properties"]["consent_name"]
            auth_domains = dataset_info["properties"]["auth_domains"]
            src_workspaces = dataset_info["properties"]["source_workspaces"]
        except:
            dataset_name = ""
        if dataset_name:
            params["dataset_id"] = dataset_id
            params["dataset_name"] = dataset_name
            params["phs_id"] = phs_id
            params["consent_name"] = consent_name
            params["auth_domains"] = auth_domains
            utils.run_t_pipeline(params)
        

# Utility Scripts
Uncomment sections as necessary to accomplish various miscellaneous tasks.

## Collect AnVIL Snapshots and Datasets

In [None]:
# Dataset_ID Filter
dataset_id_list = [
    'b12fb9be-2ce0-4bfd-8503-732fabba06ab',
    'd48adc59-8934-41bb-9720-63e71f1933be'
]

# Collect Anvil datasets and snapshots
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
logging.info(f"Start time: {current_datetime_string}")
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
datasets_list = datasets_api.enumerate_datasets(filter="anvil", limit=2000)
if dataset_id_list:
    dataset_list_len = min(len(datasets_list.items), len(dataset_id_list))
else:
    dataset_list_len = len(datasets_list.items)
records_list = []
dataset_count = 0
for dataset_entry in datasets_list.items:
    if len(dataset_id_list) == 0 or dataset_entry.id in dataset_id_list:
        dataset_count += 1
        logging.info(f"Processing dataset {dataset_count} of {dataset_list_len}")
        if re.match("^ANVIL_[a-zA-Z0-9-_]+_[0-9]{8}", dataset_entry.name.upper()):
            dataset_detail = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["PROPERTIES", "DATA_PROJECT"])
            snapshots_list = snapshots_api.enumerate_snapshots(dataset_ids=[dataset_entry.id], limit=1000)
            try:
                source_workspace = ", ".join(dataset_detail.properties["source_workspaces"])
            except:
                source_workspace = ""
            if len(snapshots_list.items) == 0:
                record = [None, None, None, None, None, None, None, None, None, dataset_entry.id, dataset_entry.name, dataset_detail.ingest_service_account, dataset_entry.created_date[0:10], dataset_entry.cloud_platform, dataset_entry.secure_monitoring_enabled, source_workspace]
                records_list.append(record)
            else:
                snapshot_list_len = len(snapshots_list.items)
                snapshot_count = 0
                for snapshot_entry in snapshots_list.items:
                    snapshot_count += 1
                    logging.info(f"Processing snapshot {snapshot_count} of {snapshot_list_len} for dataset {dataset_count}")
                    # Get public policy information
                    creds, project = google.auth.default()
                    auth_req = google.auth.transport.requests.Request()
                    creds.refresh(auth_req)
                    public_flag = "N"
                    public_response = requests.get(
                        url=f"https://sam.dsde-prod.broadinstitute.org/api/resources/v2/datasnapshot/{snapshot_entry.id}/policies/reader/public",
                        headers={"Authorization": f"Bearer {creds.token}"},
                    )
                    if public_response.text == "true":
                        public_flag = "Y"
                    # Get snapshot DUOS ID and Lock status
                    api_client = utils.refresh_tdr_api_client()
                    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
                    snapshot_detail = snapshots_api.retrieve_snapshot(id=snapshot_entry.id, include=["DUOS"])
                    duos_id = ""
                    if snapshot_detail.duos_firecloud_group:
                        duos_id = snapshot_detail.duos_firecloud_group.duos_id
                    lock_name = snapshot_detail.resource_locks.exclusive
                    if lock_name:
                        lock_status = True
                    else:
                        lock_status = False
                    # Get snapshot readers and auth domain
                    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_entry.id)
                    for role in snapshot_policy_response.policies:
                        if role.name == "reader":
                            readers = ", ".join(role.members)
                    ad_groups = ""
                    if snapshot_policy_response.auth_domain:
                        ad_groups = ", ".join(snapshot_policy_response.auth_domain)
                    record = [snapshot_entry.id, snapshot_entry.name, snapshot_entry.created_date[0:10], public_flag, readers, ad_groups, duos_id, snapshot_entry.data_project, lock_status, dataset_entry.id, dataset_entry.name, dataset_detail.ingest_service_account, dataset_entry.created_date[0:10], dataset_entry.cloud_platform, dataset_entry.secure_monitoring_enabled, source_workspace]
                    records_list.append(record)
df = pd.DataFrame(records_list, columns =["Snapshot ID", "Snapshot Name", "Snapshot Created Date", "Snapshot Public", "Snapshot Readers", "Snapshot Auth Domain", "Snapshot DUOS ID", "Snapshot Data Project", "Snapshot Locked", "Source Dataset ID", "Source Dataset Name", "Source Dataset SA", "Source Dataset Created Date", "Cloud Platform", "Secure Monitoring", "Source Workspace"])
df_sorted = df.sort_values(["Source Workspace", "Source Dataset Name", "Snapshot Name"], ascending=[True, True, True], ignore_index=True)
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
logging.info(f"End time: {current_datetime_string}")
display(df_sorted)


## Soft Deletion of TDR Dataset Records

In [7]:
# Input parameters
dataset_id_list = [
    '04a874df-c57b-40fc-9139-bc3a05129115',
]
#table_list = ["anvil_activity", "anvil_alignmentactivity", "anvil_antibody", "anvil_assayactivity", "anvil_biosample", "anvil_dataset", "anvil_diagnosis", "anvil_donor", "anvil_file", "anvil_project", "anvil_sequencingactivity", "anvil_variantcallingactivity"]
#table_list = ["file_inventory", "sample", "subject", "workspace_attributes", "sequencing", "qc_result_sample", "family", "chromosome", "interval", "participant", "discovery", "sample_set", "vcf"]
table_list = ["file_inventory"]
delete_all_records = False
delete_record_list = [
    '8acf1aab-5b13-43be-8e8f-6def3d694291',
    'a34cfccb-bd7d-4486-8eb7-7e207615f90c',
    '524dfe4b-fa62-4505-afa6-3bdb89766602',
    'f04a39d2-ed93-40cd-96bb-7e80dd10e388',
    '349c520e-693c-4553-9a64-fb6d1a3661e4',
    '21ac6979-0128-4c5b-a638-293dfaf41012',
    'b4e151db-b145-4e33-b547-449755c21849',
    '415a3143-875a-4897-afec-5806072f6123',
    '257965d1-c7bb-4126-b81f-3638da6cafea',
    '8ccfbb24-98c7-48ad-b05b-eabfb2c291b7',
    'be955e44-f676-4d37-ac5f-d54daaf047cc',
    'f9a89223-9f35-4f2e-8d57-44962092871e',
    'd802fd35-a3ac-4eb7-99cd-9ca2f1603460',
    '1e8a67f6-3b2b-443d-908a-495c750fcaa6',
    '80cb6afa-e7e3-4e63-937b-f92ad06eb23c',
    '3f185a9e-ea58-44c9-b5b0-1b1d1f9d82ab',
    '40a8d9ef-c314-4ce3-a442-abc47e12ae36',
    'ce8e0b34-7398-4ff0-a10b-f33e25d3c5ab',
    'd5f3468e-d796-40ad-b92c-c22ca37e0144',
    '67672f40-d0be-4f3a-af19-bcb9041559c0',
    '41d7b873-b627-49d1-80a6-a556470a44eb',
    '3d8ba316-b069-4576-ad0b-1d9eb185a704',
    '272d8bff-4daa-4753-b6f3-820cf21cc8ab',
    '2758ffad-a34e-4039-afd1-2dd93f54a87f',
    '366ada53-eded-4f96-a4eb-b6c7c1e8566c',
    '12604f35-16b1-4ad6-bdf4-e0f748c938ae',
    'a7931170-3b4a-4ce3-9c18-68c163da2798',
    'b64a59e2-3d66-4b2b-bd07-cb1235940602',
    '055c51c4-4125-4ecf-a955-25ed8e658b6a',
    'f6869819-7e60-436d-bd38-46b8b82b1faa',
    '6143267e-6a43-45e8-8c4d-d9764f4fb588',
    '4ca652e3-37aa-4a02-a0b9-9463db8dcaaf',
    '08f940a3-567c-4087-a2dd-183a68f34025',
    'e84c3162-5be9-47cc-92ee-99556e0566d1',
    'a427caf2-1549-4b09-bd08-1c319e82a361',
    'a4eef201-efea-4cac-aeee-d9930f585b64',
    'abaed295-730b-4eca-95bd-2afa326ac009',
    '9eb74281-aa5f-433f-b328-9edbe45dee57',
    '9ea685bc-bfeb-4d55-b0d0-96e1442311e1',
    '00f86ebf-2c09-46fc-b4e7-ccc100fc4428',
    '316599df-717a-493a-a582-75dfac72700d',
    'b439ead0-e8df-4342-a165-df78d40b3bf2',
    'fac47df5-3e08-497d-88a1-10cb134c862c',
    '3b5d82d5-4fa2-45b6-9a34-6ab936bc9cb3',
    '173a3141-f876-412b-a11b-373fdd11c2aa',
    '947a23a1-3087-469b-a623-d95ea55bf4ed',
    '8a8d2067-b92a-46b7-81fe-f0c5548b9b31',
    'e685c321-c18e-4a1e-adb8-26eabb648a47',
    'da7fec1e-a3bb-45f5-aee0-e3f773e627df',
    'ddd72800-a109-419c-bd13-bea1c4886a75',
    '945338cd-5500-4764-ad4a-d5a0cc7bac90',
    '9d50888d-9e99-4b9e-87db-af6063655c7e',
    '8c01cba3-8a72-43d3-9533-d207dc5070d6',
    '3aad620b-dce2-429b-b520-f69d3bf62b95',
    '8e9a2cc6-567d-443b-ba0d-3557f202a0d0',
    '9b52fe8f-ee7a-4907-961f-813454fef8bc',
    '76834b24-b370-4ae4-8e1a-db9526237f7f',
    '1aa37616-83ab-434d-938d-d36f6446d65b',
    '923e1bc9-997d-46b9-a34a-ea83b8742d03',
    'ab15eac9-5cf6-4d28-a4b4-6ce8c415df88',
    '70c3b527-8bda-4504-9727-1c002021e9cd',
    '9d3793bd-ed16-48bf-b8e8-e875e0059e08',
    'ebe5b83b-f6f4-466c-a69a-63a55bdec00c',
    '5b0019e0-54b8-481d-a569-31351e406c4e',
    '991f34c5-676b-4e55-8a52-31a6eecb41d9',
    '888d1f83-7b25-405b-ae14-ca74f1d0034c',
    '260eb1e4-6ec4-4ef9-be8a-f4547914957e',
    '5c9ee153-bb52-491a-9935-d484d07d8003',
    '45ec79a3-3d88-47d7-8c92-fee0f0be33a2',
    '68ce92a9-671d-45a1-a982-6062b711d602',
    'ece23424-47f1-4a9b-9bfb-13194892566e',
    '873d9dba-afa1-498d-b0d6-fe583010cd22',
    '1b3bf439-0a21-47f3-9a5b-9608249c0c24',
    '520ad874-da93-4291-8be3-345c92ce6b1d',
    'e748be16-9050-45c9-a05a-4c83e96ea109',
    '646b4c50-21bd-42f2-b796-5e468a1e2482',
    '09fca946-881a-4289-b1d4-39c2035ccb73',
    '0b76531d-e69b-4fdd-a1d7-7821a1ec1920',
    'f3462a40-e210-4ad9-9c57-458c71669fea',
    'be9e2ddc-9f44-41fa-8c4a-afdfb0427868',
    '66e910aa-0b64-4367-8f47-5daf59ab3bd9',
    '83933015-a449-4dab-a00c-60cdc027850c',
    'e1e5e341-8f23-4316-86cb-2f6699b33c8f',
    'e83d41e7-b258-41e9-a072-3b1869edcd44',
    '42d07a25-1320-47a8-9f82-9a54aa8ccdc8',
    '901e4cc0-397a-4ce8-aa90-dc3a7ccf8012',
    'dc0affc6-7162-4778-a18e-1b7f5e9fd6ce',
    '8a61f4f7-18f6-4410-8c57-2ead5e23bd25',
    'c61a2190-d18c-4b15-b3c8-c9d8b443e8d1',
    '271234a1-832b-46b5-a648-f9b07fe0f600',
    '83b54e82-5c5a-4006-b524-e63e7bb15f01',
    '4ee2f1f8-6bea-47e4-8229-36e78ac23e6f',
    'ed541762-a443-4fb3-88fe-89f936b58e49',
    '7e4ce661-9bcf-4cf1-9d44-c95cec0ce0fa',
    '9c772f97-d17d-44e1-af96-315d9bfd4d45',
    '119d215e-5e85-4ac9-81d8-efc507dbb262',
    'e438d337-5657-4e65-a89e-2d85c30b7931',
    '649e50f2-2ec7-4abc-a15d-19b0c938bfaf',
    'd1acf55b-6133-48b7-b669-80e758b218e1',
    '43d4349e-c114-422a-ae0a-bd6be74d8457',
    'd41c3efe-659f-4568-87fc-d366546f5386',
    'ff3e1801-8b78-4e3e-8722-64d791af4009',
    'a0326832-af55-4591-b7cf-71767e094232',
    'b436c85c-f1d8-4fce-96b5-0f765cd882bc',
    '14d8a1b4-c2e1-43c3-9941-6d3a6ef81a92',
    'd30b3c83-5a13-4e34-aea5-f7f2b8579c82',
    'caa2e92d-c366-4a88-b1ea-ac57b391828e',
    'afe8260c-697a-487d-9b71-360ed04c744e',
    '2dfeaea5-3caf-48ca-b610-3f8f5422a5f6',
    '57185c6e-8d25-49cb-9871-56ff564a4917',
    'b57c59f9-8dc7-4092-9e3f-aa7c6e033a51',
    '5ab6cc40-620d-4a96-9cf2-9eb2047e16f4',
    '02f27d6f-0cb8-4307-b06b-ad545a4acca4',
    '157b85f3-7bcb-47b0-8e34-1453c2d3ae48',
    'ae2a67f8-e082-49a9-92ff-6317605e04a1',
] # Will be ignored if delete_all_records is set to True

#--------------------------------------------------------------------------------------------------------

# Function to delete rows from a dataset
def delete_datarepo_rows(dataset_id, table_name, datarepo_row_ids):
    print("Attempting to delete specified rows from {} for dataset {}".format(table_name, dataset_id))
    if datarepo_row_ids:
        data_deletion_payload = {
            "deleteType": "soft",
            "specType": "jsonArray",
            "tables": [{
              "tableName": table_name,
              "jsonArraySpec": {
                "rowIds": datarepo_row_ids
              }
            }]
        }
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            data_deletion_result, job_id = utils.wait_for_tdr_job(datasets_api.apply_dataset_data_deletion(id=dataset_id, data_deletion_request=data_deletion_payload))
            print("Result: {}".format(data_deletion_result))
        except Exception as e:
            print("Error: {}".format(str(e)))
    else:
        print("No datarepo_row_ids specified for deletion.")

# Function to collect all datarepo rows for a particular table within a dataset
def collect_all_datarepo_rows(dataset_id, table_name):
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        if response["access_information"]["big_query"]: 
            cloud = "gcp"
            bq_project = response["access_information"]["big_query"]["project_id"]
            bq_schema = response["access_information"]["big_query"]["dataset_name"]
        else:
            cloud = "azure"
            for parquet_table in response["access_information"]["parquet"]["tables"]:
                if parquet_table["name"] == table_name:
                    sas_url = parquet_table["url"] + "?" + parquet_table["sas_token"]
                    break
    except Exception as e:
        print("Error retrieving dataset information: {}".format(str(e)))
    if cloud == "gcp":
        client = bigquery.Client()
        query = "SELECT datarepo_row_id FROM `{project}.{schema}.{table}`".format(project = bq_project, schema = bq_schema, table = table_name)
        try:
            query_job = client.query(query)
            results = [row["datarepo_row_id"] for row in query_job]
            return results
        except Exception as e:
            print("Error retrieving datarepo_row_id list: {}".format(str(e)))
    else:
#         blob_client = BlobClient.from_blob_url(sas_url)
#         downloaded_blob = blob_client.download_blob()
#         bytes_io = BytesIO(downloaded_blob.readall())
#         df_blob = pd.read_parquet(bytes_io)
        retrieval_error = False
        max_page_size = 1000
        records_fetched = 0 
        total_record_count = 1
        results = []
        while records_fetched < total_record_count and not retrieval_error:
            row_start = records_fetched
            attempt_counter = 0
            while True:
                payload = {
                  "offset": row_start,
                  "limit": max_page_size,
                  "sort": "datarepo_row_id",
                  "direction": "asc",
                  "filter": ""
                }
                try:
                    dataset_results = datasets_api.query_dataset_data_by_id(id=dataset_id, table=table_name, query_data_request_model=payload).to_dict() 
                    total_record_count = dataset_results["total_row_count"]
                    for record in dataset_results["result"]:
                        results.append(record["datarepo_row_id"])
                        records_fetched += 1
                    break
                except Exception as e:
                    if attempt_counter < 5:
                        sleep(10)
                        attempt_counter += 1
                        continue
                    else:
                        warn_str = "Error retrieving data_repo_row_ids for table."
                        logging.warning(warn_str)
                        retrieval_error = True
                        break
        return results
    
# Function to loop through datasets and delete
def execute_deletions(dataset_id_list, table_list, delete_all_records, delete_record_list):
    for dataset_id in dataset_id_list:
        print(f"Processing record deletions for dataset {dataset_id}")
        for table in table_list:
            print(f"Processing record deletion for {table}")
            if delete_all_records:
                datarepo_row_ids = collect_all_datarepo_rows(dataset_id, table)
            else:
                datarepo_row_ids = delete_record_list
            if datarepo_row_ids:
                delete_datarepo_rows(dataset_id, table, datarepo_row_ids)
            else:
                print("No records specified for deletion.")
                
#--------------------------------------------------------------------------------------------------------

execute_deletions(dataset_id_list, table_list, delete_all_records, delete_record_list)              



Processing record deletions for dataset 04a874df-c57b-40fc-9139-bc3a05129115
Processing record deletion for file_inventory
Attempting to delete specified rows from file_inventory for dataset 04a874df-c57b-40fc-9139-bc3a05129115
TDR Job ID: oBqszfLPShyzkOv-tFt3MA
Result: {'objectState': 'deleted'}


## Lock/Unlock Snapshots

In [None]:
#############################################
## Functions
#############################################

def update_snapshot_lock_status(snapshot_action, snapshot_id_list):
    results = []
    # Validate snapshot action
    print(f"Validating provided snapshot action: {snapshot_action}")
    if snapshot_action not in ["LOCK", "UNLOCK"]:
        results.append(["ALL", snapshot_action, "Failure", "Invalid snapshot action specified. Must be LOCK or UNLOCK."])
    else:
        # Loop through and process snapshots
        act = snapshot_action.lower()
        for snapshot_id in snapshot_id_list:

            # Initialize
            print(f"Updating snapshot lock status for snapshot: {snapshot_id}.")
            error_str = ""
            api_client = utils.refresh_tdr_api_client()
            snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

            # Update snapshot lock status
            if act == "lock":
                # Lock snapshot
                try:
                    response = snapshots_api.lock_snapshot(id=snapshot_id)
                    results.append([snapshot_id, snapshot_action, "Success", None])
                except Exception as e: 
                    error_str = f"Error updating snapshot lock status: {str(e)}"
                    print(error_str)
                    results.append([snapshot_id, snapshot_action, "Failure", error_str])
            else:
                # Fetch exclusive lock from snapshot (if there is one)
                try:
                    snapshot_detail = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
                    lock_name = snapshot_detail["resource_locks"].get("exclusive")
                    if lock_name:
                        # Unlock snapshot (if locked)
                        try:
                            request_body = {"lockName": lock_name, "forceUnlock": False}
                            response = snapshots_api.unlock_snapshot(id=snapshot_id, unlock_resource_request=request_body)
                            results.append([snapshot_id, snapshot_action, "Success", None])
                        except Exception as e: 
                            error_str = f"Error updating snapshot lock status: {str(e)}"
                            print(error_str)
                            results.append([snapshot_id, snapshot_action, "Failure", error_str])
                    else:
                        results.append([snapshot_id, snapshot_action, "Success", "No existing lock found on snapshot."])
                except Exception as e:
                    error_str = f"Error retrieving lock on snapshot: {str(e)}"
                    results.append([snapshot_id, snapshot_action, "Failure", error_str])

    # Display results
    print("\nResults:")
    results_df = pd.DataFrame(results, columns = ["snapshot", "action", "status", "errors"])
    display(results_df)

#############################################
## Input Parameters
#############################################

# Specify the action to apply to the snapshots (LOCK/UNLOCK):
snapshot_action = "UNLOCK"

# Specify the list of snapshots to apply the action to:
snapshot_id_list = [
    "c3e5c093-3156-4b4c-be3a-2c307c3d8b23"
]

#############################################
## Execution
#############################################

update_snapshot_lock_status(snapshot_action, snapshot_id_list)

## Lock/Unlock Datasets

In [None]:
#############################################
## Functions
#############################################

def update_dataset_lock_status(dataset_action, dataset_id_list):
    results = []
    # Validate dataset action
    print(f"Validating provided dataset action: {dataset_action}")
    if dataset_action not in ["LOCK", "UNLOCK"]:
        results.append(["ALL", dataset_action, "Failure", "Invalid dataset action specified. Must be LOCK or UNLOCK."])
    else:
        # Loop through and process datasets
        act = dataset_action.lower()
        for dataset_id in dataset_id_list:

            # Initialize
            print(f"Updating dataset lock status for dataset: {dataset_id}.")
            error_str = ""
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

            # Update dataset lock status
            if act == "lock":
                # Lock dataset
                try:
                    response = datasets_api.lock_dataset(id=dataset_id)
                    results.append([dataset_id, dataset_action, "Success", None])
                except Exception as e: 
                    error_str = f"Error updating dataset lock status: {str(e)}"
                    print(error_str)
                    results.append([dataset_id, dataset_action, "Failure", error_str])
            else:
                # Fetch exclusive lock from dataset (if there is one)
                try:
                    dataset_detail = datasets_api.retrieve_dataset(id=dataset_id).to_dict()
                    exclusive_lock_name = dataset_detail["resource_locks"].get("exclusive")
                    shared_locks = dataset_detail["resource_locks"].get("shared") 
                    if exclusive_lock_name:
                        # Unlock dataset (if locked with exclusive lock)
                        try:
                            request_body = {"lockName": exclusive_lock_name, "forceUnlock": False}
                            response = datasets_api.unlock_dataset(id=dataset_id, unlock_resource_request=request_body)
                            results.append([dataset_id, dataset_action, "Success", None])
                        except Exception as e: 
                            error_str = f"Error updating dataset lock status: {str(e)}"
                            print(error_str)
                            results.append([dataset_id, dataset_action, "Failure", error_str])
                    elif shared_locks:
                        # Unlock dataset (if locked with shared locks)
                        for lock_name in shared_locks:
                            try:
                                request_body = {"lockName": lock_name, "forceUnlock": False}
                                response = datasets_api.unlock_dataset(id=dataset_id, unlock_resource_request=request_body)
                                results.append([dataset_id, dataset_action, "Success", None])
                            except Exception as e: 
                                error_str = f"Error updating dataset lock status: {str(e)}"
                                print(error_str)
                                results.append([dataset_id, dataset_action, "Failure", error_str])
                    else:
                        results.append([dataset_id, dataset_action, "Success", "No existing lock found on dataset."])
                except Exception as e:
                    error_str = f"Error retrieving lock on dataset: {str(e)}"
                    results.append([dataset_id, dataset_action, "Failure", error_str])

    # Display results
    print("\nResults:")
    results_df = pd.DataFrame(results, columns = ["dataset", "action", "status", "errors"])
    display(results_df)

#############################################
## Input Parameters
#############################################

# Specify the action to apply to the datasets (LOCK/UNLOCK):
dataset_action = "UNLOCK"

# Specify the list of datasets to apply the action to:
dataset_id_list = [
    '32c09444-3d4a-44d5-af6b-07eef92189db',
    '280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
    'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
    '20ddfcd5-d456-431b-9f05-781e05d873d6',
    '15d41c35-943c-474b-afa6-e1c6d6e4be2b',
    '3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
    'dd6c6688-b73a-464c-86d9-3369fdf98268',
    '15b153f5-ed02-4216-8f96-99743b8b4fc3',
    '747858c0-d139-4f52-9f0e-a618b880d6d6',
    '3376a8b6-7ef6-4191-97ab-a547da0d330d',
    'f85e467a-958f-4da5-a01b-8df883e69122',
    '0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
    '6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
    'c9dd3578-01db-4687-9807-4f71368941d1',
    '5edcc3db-c676-412a-9506-600959bb81f2',
    'ccc524ab-d9ad-467c-a25b-9a14fb05e976',
    '4b341ba9-49a5-43a2-9b7e-cc96beb59946',
    '0c18589c-6432-4a6c-90ce-985a47a66f39',
    'e6b15b39-daba-431f-a918-e4e43e702c30',
    'aa314675-af62-41df-b5cb-3b22558e903b',
    '20741062-7d1d-44b7-bc33-39c9ad26e414',
    '69ce1be3-1815-43a4-bdd2-4696d9c8d09a',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '18716daf-4223-44a9-bba9-fc9baeef7d07',
    '373ff2e8-0f63-4179-a55c-3fe0b85556aa',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    '462d992a-7c13-45ac-a6da-1254fc3a9031',
    '74d1e549-5ae8-4410-9428-f8f2cc85fa80',
]

#############################################
## Execution
#############################################

update_dataset_lock_status(dataset_action, dataset_id_list)


## TDR Dataset and/or Snapshot Deletion

In [2]:
# Function to delete a specific TDR Snapshot
def delete_snapshot(snapshot_id):
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    print("Attempting to delete snapshot = {}".format(snapshot_id))
    try:
        delete_snapshot_result, job_id = utils.wait_for_tdr_job(snapshots_api.delete_snapshot(id=snapshot_id))
        print("Result: {}".format(delete_snapshot_result))
    except Exception as e:
        print("Error: {}".format(e))

# Function to delete a specific TDR Dataset
def delete_dataset(dataset_id):
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    print("Attempting to delete dataset = {}".format(dataset_id))
    try:
        delete_dataset_result, job_id = utils.wait_for_tdr_job(datasets_api.delete_dataset(id=dataset_id))
        print("Result: {}".format(delete_dataset_result))
    except Exception as e:
        print("Error: {}".format(e))

# Function to delete a specific TDR Dataset and all of its Snapshots
def delete_dataset_and_all_snapshots(dataset_id):
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    print("Attempting to delete dataset = {} and all associated snapshots".format(dataset_id))
    dataset_id_list = [dataset_id]
    # Delete snapshots
    snapshot_list = snapshots_api.enumerate_snapshots(dataset_ids=dataset_id_list)
    if snapshot_list.items:
        for snapshot in snapshot_list.items:
            snapshot_id = str(snapshot.id)
            delete_snapshot(snapshot_id)
            sleep(10)
    # Delete dataset
    delete_dataset(dataset_id)

# # Delete snapshots
# snapshot_id_list = [
# ]
# for snapshot_id in snapshot_id_list:
#     delete_snapshot(snapshot_id)

# Delete datasets and all their associated snapshots
dataset_id_list = [
    'cc9b89e8-30ea-4149-9c37-3000128ad42c',
]
for dataset_id in dataset_id_list:
    delete_dataset_and_all_snapshots(dataset_id)

Attempting to delete dataset = cc9b89e8-30ea-4149-9c37-3000128ad42c and all associated snapshots
Attempting to delete dataset = cc9b89e8-30ea-4149-9c37-3000128ad42c
TDR Job ID: UawR-vXuT_u7lvFTFvaBag
Result: {'objectState': 'deleted'}


## Clean Up Outdated AnVIL TDR Service Accounts

In [None]:
valid_sa_list = [
]

# Establish credentials
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Get current anvil_tdr_ingest membership
group = "anvil_tdr_ingest"
group_members = requests.get(
    url=f"https://api.firecloud.org/api/groups/{group}",
    headers={"Authorization": f"Bearer {creds.token}"}
).json()

# Loop through anvil_tdr_ingest membership and remove outdated users
user_cnt = 0
success_cnt = 0
for member in group_members["membersEmails"]:
    if "tdr-ingest-sa" in member and member not in valid_sa_list:
        user_cnt += 1
        response = requests.delete(
            url=f"https://api.firecloud.org/api/groups/{group}/member/{member}",
            headers={"Authorization": f"Bearer {creds.token}"}
        )
        if response.status_code == 204:
            success_cnt += 1
print(f"Group ({group}) clean-up: ")
print(f"\t- Users to remove: {user_cnt}")
print(f"\t- Users removed successfully: {success_cnt}")

# Get current workspace membership
ws_members = requests.get(
    url=f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}/acl",
    headers={"Authorization": f"Bearer {creds.token}"}
).json()

# Loop through workspace membership and remove outdated users
user_cnt = 0
success_cnt = 0
for member in ws_members["acl"].keys():
    if "tdr-ingest-sa" in member and member not in valid_sa_list:
        user_cnt += 1
        payload = [{
            "email": member,
            "accessLevel": "NO ACCESS",
            "canShare": False,
            "canCompute": False
        }]
        response = requests.patch(
            url=f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}/acl",
            headers={"Authorization": f"Bearer {creds.token}"}, 
            json=payload
        )
        if response.status_code == 200:
            success_cnt += 1
print(f"Workspace ({ws_project}/{ws_name}) clean-up: ")
print(f"\t- Users to remove: {user_cnt}")
print(f"\t- Users removed successfully: {success_cnt}")



## Other Misc

In [None]:
!gsutil -u anvil-datastorage ls gs://fc-secure-33cad843-3453-42ea-bf50-0eda2b52171d