In [None]:
# Version History
# Convention used in this cell: every release appends a new print line at the bottom and the
# previous line is commented out, so running the cell prints only the current version while
# the older entries remain readable as a changelog.
#print("Version 1.0.0: 09/23/2022 5:06pm - Nate Calvanese - First version created")
#print("Version 1.0.1: 09/26/2022 11:18m - Nate Calvanese - Fixed bug in default dataset naming")
#print("Version 1.0.2: 09/27/2022 2:43pm - Nate Calvanese - Added ability to aggregate multiple workspaces into one dataset")
#print("Version 1.0.3: 10/5/2022 1:32pm - Nate Calvanese - Added support for chunking up ingest requests")
#print("Version 1.0.4: 10/6/2022 10:35am - Nate Calvanese - Updated use of TDR utility functions")
#print("Version 1.0.5: 10/13/2022 10:54am - Nate Calvanese - Parameter tweaks for latest changes")
#print("Version 1.0.6: 10/21/2022 10:53am - Nate Calvanese - Version stamp for latest changes to supporting notebooks")
#print("Version 1.0.7: 10/24/2022 4:58pm - Nate Calvanese - Added support for project entity name derivation")
#print("Version 1.0.8: 10/26/2022 4:24pm - Nate Calvanese - Added support for batching mapping activities in section 3")
#print('Version 1.0.9: 2/21/2023 2:50pm - Nate Calvanese - Added support for $BQ_DATASET substitution variable in mapping section')
#print('Version 1.0.10: 3/8/2023 8:17am - Nate Calvanese - Performance improvements')
#print('Version 1.0.11: 7/11/2023 8:17am - Nate Calvanese - Added auth domain back as reader on snapshots')
#print('Version 1.0.12: 9/1/2023 10:16am - Nate Calvanese - Added functionality to enable/disable secure monitoring for public datasets.')
#print('Version 1.0.13: 12/15/2023 9:00am - Nate Calvanese - Added functionality to optionally truncate tables before ingest')
print('Version 1.0.14: 1/12/2024 11:28am - Nate Calvanese - Added max_combined_rec_ref_size as a global parameter')


# Imports and Common Variables

In [None]:
# Install additional modules (one time effort per cloud environment)
#!pip install --upgrade pip import_ipynb data_repo_client urllib3 xmltodict azure-storage-blob
#!pip install data_repo_client==1.409.0

In [2]:
# Workspace environment variables
import os
import re
# Read the workspace name, namespace (project) and bucket from the environment.
# A missing variable raises KeyError, which stops the cell before anything else runs.
print("Recording workspace environment variables:")
ws_name, ws_project, ws_bucket = (
    os.environ[env_var]
    for env_var in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# Bucket name is the bucket URL with a leading "gs://" removed
ws_bucket_name = re.sub('^gs://', '', ws_bucket)
print("\n".join(
    f"{label} = {value}"
    for label, value in (
        ("Workspace name", ws_name),
        ("Workspace project", ws_project),
        ("Workspace bucket", ws_bucket),
        ("Workspace bucket name", ws_bucket_name),
    )
))

# Copy latest version of the pipeline notebooks to the cloud environment (uncomment if any notebooks have changed since last run)
# print("\nCopying latest pipeline notebooks to the cloud environment:")
# !gsutil -m cp $ws_bucket/notebooks/*.ipynb .

# Additional imports
print("\nRunning imports:")
import import_ipynb
import pandas as pd
from firecloud import api as fapi
import data_repo_client
import ingest_pipeline_utilities as utils
import build_mapping_query as bmq
from google.cloud import storage
from google.cloud import bigquery
import google.auth
import google.auth.transport.requests
import logging
import datetime
import json
import sys
from time import sleep
import requests
from io import BytesIO
import pyarrow.parquet as pq
from azure.storage.blob import BlobClient, ContainerClient

# Common pipeline variables (AnVIL)
# Settings shared by every run; the run-variables cell further down adds per-run entries to this dict.
ws_attributes = utils.get_workspace_attributes(ws_project, ws_name)
params = {
    # Context of the workspace this notebook is running in
    "ws_name": ws_name,
    "ws_project": ws_project,
    "ws_bucket": ws_bucket,
    "ws_bucket_name": ws_bucket_name,
    "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
    "google_project": ws_attributes["googleProject"],
    # File inventory table settings
    "create_file_table": True,
    "file_table_name": "file_inventory",
    "ingest_user_to_add": "tdr_sa",  # tdr_sa or anvil_tdr_ingest
    "global_file_exclusions": ["SubsetHailJointCall", ".vds/"],
    "max_combined_rec_ref_size": 40000,
}

# Configure logging format
# Detach every handler already on the root logger first; basicConfig is a no-op while the
# root logger still has handlers, so the format below would otherwise be ignored.
for existing_handler in list(logging.root.handlers):
    logging.root.removeHandler(existing_handler)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Pandas display settings applied in the order listed: no row/column/width truncation for displayed frames
pandas_display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    "display.max_colwidth": None,
    'display.width': 1000,
    'display.colheader_justify': 'center',
    'display.precision': 3,
}
for option_name, option_value in pandas_display_options.items():
    pd.set_option(option_name, option_value)


Recording workspace environment variables:
Workspace name = anvil_workspace_ingest_resources_dev
Workspace project = dsp-data-ingest
Workspace bucket = gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03
Workspace bucket name = fc-2a9eefc3-0302-427f-9ac3-82f078741c03

Running imports:
importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.41: 6/24/2024 9:48am - Nate Calvanese - Updated snapshot creation requests to include data access control groups.
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
importing Jupyter notebook from build_file_inventory.ipynb
Version 2.0.4: 4/12/2024 2:30pm - Nate Calvanese - Fixed target path logic to remove unsupported characters
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.10: 1/12/2024 11:25am - Nate Calvanese - Made max_combined_rec_ref_size configurable
importing Jupyter notebook from build_mapping_query.ipynb
Versi

# "EL" Pipeline: Load Dataset to TDR in Source Format

## Pipeline Run Variables

In [None]:
## >>> Run Variables <<<
# For datasets split across multiple workspaces, set the staging area and target TDR dataset to the 
# same value to collect all of the source data and process it together.
# Each entry is read by position further down in this notebook:
#   [0] workspace name, [1] workspace project, [2] public flag, [3] staging area,
#   [4] target TDR dataset name, [5] run flag (only entries with True are processed).
# Empty values at [3] and [4] are filled in place with defaults by the print section of this cell.
workspace_run_list = [
    #["Workspace_Name", "Workspace_Project", Public (True/False), "Staging Area (Leave empty for default)", "Target_TDR_Dataset_Name (Leave empty for default)", Run (True/False)]
#     ["ANVIL_Workspace_1", "anvil-datastorage", False, "", "", False],
#     ["ANVIL_Workspace_2", "anvil-datastorage", False, "", "", False],
    ['AnVIL_HPRC', 'anvil-datastorage', True, '', 'ANVIL_HPRC_20240401', True],
]
# Step toggles. skip_source_files_creation is checked directly in the Pipeline Execution cell;
# the remaining flags are handed to the utils pipeline functions through params.
params["skip_source_files_creation"] = False
params["skip_file_inventory_creation"] = False
params["skip_table_data_processing"] = False
params["skip_ingests"] = False
# Reported below as "Truncate tables before ingest?"
params["trunc_before_ingest"] = True
params["skip_snapshot_creation"] = True
params["snapshot_readers_list"] = ["auth-domain"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)


## >>> File Inventory Variables <<<
# The GCS bucket associated with the source workspace will be automatically included in the file inventory build. To specify 
# additional GCS buckets to include in the file inventory build, add entries to the below dictionary.
params["additional_file_inventory_sources"] = {}
# EXAMPLE:
# params["additional_file_inventory_sources"] = {
#     "staging_area": {
#         "bucket_name": {
#             "include_dirs": [], # Leave empty to include all directories in bucket
#             "exclude_dirs": [] # Exclusions will take precedence over inclusions
#         }
#     }
# }


## >>> Ingest Variables <<<
# For cases where you only want to ingest a subset of files, use the below dictionary to specify exactly what should be ingested.
params["ingest_list_override"] = {
}
# EXAMPLE:
# params["ingest_list_override"] = {
#     "ws_table": ["ws_table_0.json"], # Leave empty to run ingest for every file for target table
# }


## >>> File Reference Variables <<<
# Fields containing GCS links will be identified automatically by the pipeline. The below dict should contain any fields
# that contain file references that aren't proper GCS links in the workspace tables.
# This dict is attached to params as "data_file_refs" by the Pipeline Execution cell.
data_file_refs_dict = {   
}
# Definitions:
#    Required Fields: column, method, mode, create_new field
#    Optional Fields: match_multiple_files (default to True), match_regex (default to None), match_type (default to 'partial'), new_field_name (default to None)
#    Methods: 
#       file_path_match -- Field contains a full or partial file path, which can be matched to the file inventory to grab the file(s) referenced 
#       tdr_file_id -- Field contains file UUIDs of files already ingested into the target TDR dataset
#    Modes:
#       fileref_in_line -- Populates the field with a file reference object
#       fileref_table_ref -- Populates the field with an ID that joins to a file table. If no file table built, falls back on fileref_in_line logic.
    
#-----------------------------------------------------------------------------------------------------------#
    
# Print variables
# Echo the effective run configuration so it can be reviewed before the pipeline is executed.
print("Pipeline run variables set:")
print("Profile ID: " + params["profile_id"])
print("Ingests to run: ")
current_datetime = datetime.datetime.now()
current_date_string = current_datetime.strftime("%Y%m%d")
for workspace in workspace_run_list:
    # Only entries whose run flag (index 5) equals True are reported
    if workspace[5] != True:
        continue

    # Look up the source workspace and derive the per-workspace parameters from its attributes.
    # Each enabled workspace overwrites these params entries, so the last one in the list wins.
    ws_attributes = utils.get_workspace_attributes(workspace[1], workspace[0])
    ws_attribute_values = ws_attributes["attributes"]
    raw_phs_id = ws_attribute_values.get("phs_id")
    params["phs_id"] = utils.format_phs_id(raw_phs_id) if raw_phs_id else ""
    auth_list = ws_attributes.get("authorizationDomain") or []
    params["auth_domains"] = [auth_entry["membersGroupName"] for auth_entry in auth_list]
    params["consent_name"] = ws_attribute_values.get("library:dataUseRestriction") or ""
    params["data_files_src_bucket"] = ws_attributes.get("bucketName") or ""
    params["public_dataset"] = workspace[2]

    # Fill in defaults in place: dataset name derived from the workspace name, staging area = workspace name
    if not workspace[4]:
        workspace[4] = utils.format_dataset_name(workspace[0])
    if not workspace[3]:
        workspace[3] = workspace[0]

    print("".join([
        "- Workspace [", workspace[1], "/", workspace[0],
        "] to TDR dataset [", workspace[4],
        "] via Staging Area [", workspace[3], "]",
    ]))
    workspace_details = [
        ("\t- PHS ID = ", params["phs_id"]),
        ("\t- Consent Short Name = ", params["consent_name"]),
        ("\t- Auth Domains = ", str(params["auth_domains"])),
        ("\t- Public Dataset = ", str(params["public_dataset"])),
        ("\t- Data Files Source Bucket = ", params["data_files_src_bucket"]),
    ]
    for detail_label, detail_value in workspace_details:
        print(detail_label + detail_value)

# Run-level switches, printed in the same order the pipeline steps are listed above
run_switches = [
    ("Skip source files creation? ", "skip_source_files_creation"),
    ("Skip file inventory creation? ", "skip_file_inventory_creation"),
    ("Skip table data processing? ", "skip_table_data_processing"),
    ("Skip ingests? ", "skip_ingests"),
    ("Truncate tables before ingest? ", "trunc_before_ingest"),
    ("Ingest override list: ", "ingest_list_override"),
    ("Skip snapshot creation? ", "skip_snapshot_creation"),
]
for switch_label, switch_key in run_switches:
    print(switch_label + str(params[switch_key]))


## Pipeline Execution

In [None]:
# Loop through and execute workspace connector pipeline ("E") for listed workspaces
# NOTE(review): the per-workspace entries in params (phs_id, auth_domains, consent_name,
# data_files_src_bucket, public_dataset) were last written by the final enabled workspace in the
# run-variables cell -- confirm the utils functions re-derive them when several workspaces are enabled.
if params["skip_source_files_creation"] == True:
    logging.info("Skipping source file creation, per user request.")
else:
    for workspace in workspace_run_list:
        # Entries whose run flag (index 5) is not True are left out
        if workspace[5] != True:
            continue
        params["data_file_refs"] = data_file_refs_dict
        utils.run_ws_connector_pipeline(workspace, params)

# Aggregate staging area to target dataset combinations, loop through them, and execute ingest pipeline ("L")
# Workspaces that share staging area, target dataset and public flag collapse into a single run.
pipeline_run_list = []
for workspace in workspace_run_list:
    if workspace[5] != True:
        continue
    staging_target_combo = [workspace[3], workspace[4], workspace[2]]
    if staging_target_combo in pipeline_run_list:
        continue
    pipeline_run_list.append(staging_target_combo)
for pipeline in pipeline_run_list:
    utils.run_el_pipeline(pipeline, params)


# Mapping Development
Work through the following steps for each dataset that needs to be processed through the transformation pipeline in Step 4, specifying the target schema ("mapping_target") and mapping specification ("mapping_target_spec") you would like to use for transformation. Note that you can use the logs or results_dict from the previous step to retrieve the dataset_id values of interest, or retrieve them directly from TDR via the UI or Swagger.

## Dataset Mapping Variables

In [3]:
## >>> Mapping Variables <<<
# For each dataset specified, include an appropriate mapping target and mapping target specification
datasets_to_map_list = [
    #["dataset_id", "mapping_target", "mapping_target_spec", Run (True/False)]
    ['cefc1a79-446c-40d2-b140-ba8d8b1c0712', 'anvil', 'gregor_1', True],
    ['2355554e-8951-4b41-bcd8-32e18cddb7c9', 'anvil', 'gregor_1', True],
]

#-----------------------------------------------------------------------------------------------------------#

def _summarize_error(error):
    """Return a short, single-line description of an exception for the skip report."""
    message_lines = str(error).strip().splitlines()
    first_line = message_lines[0][:200] if message_lines else ""
    if first_line:
        return "{}: {}".format(type(error).__name__, first_line)
    return type(error).__name__

# Print variables
# Validates every enabled entry and builds final_datasets_to_map_dict, which the following cells
# iterate over. An entry is kept only when (1) the dataset can be retrieved from TDR and (2) both
# mapping artifacts can be downloaded from the workspace bucket and parsed as JSON.
print("Datasets to map: ")
api_client = utils.refresh_tdr_api_client()
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
final_datasets_to_map_dict = {}
skip_dataset_list_access = []
skip_dataset_list_mapping = []
# Failure reasons keyed by dataset_id, reported next to the skipped IDs below
skip_access_reasons = {}
skip_mapping_reasons = {}
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
for dataset in datasets_to_map_list:
    if not dataset[3]:
        continue
    dataset_id = dataset[0]
    mapping_target = dataset[1]
    mapping_target_spec = dataset[2]

    # Check 1: dataset exists and is readable by the current user
    dataset_name = ""
    try:
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
    except Exception as e:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still stop the cell
        skip_dataset_list_access.append(dataset_id)
        skip_access_reasons[dataset_id] = _summarize_error(e)

    # Check 2: target schema and mapping specification exist and are valid JSON.
    # The parsed content is not kept here; later cells download the artifacts again.
    try:
        blob = bucket.blob("ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target))
        json.loads(blob.download_as_string(client=None))
        blob = bucket.blob("ingest_pipeline/mapping/{}/{}/mapping_specification.json".format(mapping_target, mapping_target_spec))
        json.loads(blob.download_as_string(client=None))
    except Exception as e:
        skip_dataset_list_mapping.append(dataset_id)
        skip_mapping_reasons[dataset_id] = _summarize_error(e)

    if dataset_id not in skip_dataset_list_access and dataset_id not in skip_dataset_list_mapping:
        final_datasets_to_map_dict[dataset_id] = {
            "mapping_target": mapping_target,
            "mapping_target_spec": mapping_target_spec,
        }
        print("\t- " + dataset_name + " ({})".format(dataset_id) + " with {}/{}".format(mapping_target, mapping_target_spec))

if skip_dataset_list_access:
    print("Datasets to skip due to non-existence or inaccessibility to the current user:")
    for skipped_id in skip_dataset_list_access:
        print("\t- {} ({})".format(skipped_id, skip_access_reasons.get(skipped_id, "")))
if skip_dataset_list_mapping:
    print("Datasets to skip due to invalid mapping target or mapping target specification:")
    for skipped_id in skip_dataset_list_mapping:
        print("\t- {} ({})".format(skipped_id, skip_mapping_reasons.get(skipped_id, "")))


Datasets to map: 
	- ANVIL_GREGoR_R01_GRU_20240208 (cefc1a79-446c-40d2-b140-ba8d8b1c0712) with anvil/gregor_1
	- ANVIL_GREGoR_R01_HMB_20240208 (2355554e-8951-4b41-bcd8-32e18cddb7c9) with anvil/gregor_1


## Add Missing Relationships to TDR Dataset Schema
Relationships are needed by the mapping query constructor to build appropriate joins between tables. If no joins are required between tables, this step is unnecessary. 

In [4]:
# Record relationships to potentially add to the source datasets. Note that there may be more relationships to add
# than those listed below, so add to this list as necessary.
# Each entry is ["from_table.from_column", "to_table.to_column"].
potential_relationships = [
    ["subject.family_id", "family.family_id"],
    ["sample.subject_id", "subject.subject_id"],
    ["sample.t_01_subject_id", "subject.subject_id"],
    ["sequencing.sample_id", "sample.sample_id"],
    ["sequencing.sample", "sample.sample_id"],
    ["sequencing.sample_alias", "sample.sample_id"],
    ["sample.participant", "participant.participant_id"],
    ["sample.participant_id", "participant.participant_id"],
    ["discovery.sample_id", "sample.sample_id"],
    ["discovery.subject_id", "subject.subject_id"],
    ["qc_result_sample.qc_result_sample_id", "sample.sample_id"],
    ["interval.chromosome", "chromosome.chromosome_id"],
    ["analyte.participant_id", "participant.participant_id"],
    ["participant.family_id", "family.family_id"],
    ["phenotype.participant_id", "participant.participant_id"],
    ["experiment_rna_short_read.analyte_id", "analyte.analyte_id"],
    ["experiment_dna_short_read.analyte_id", "analyte.analyte_id"],
    ["aligned_rna_short_read.experiment_rna_short_read_id", "experiment_rna_short_read.experiment_rna_short_read_id"],
    ["aligned_dna_short_read.experiment_dna_short_read_id", "experiment_dna_short_read.experiment_dna_short_read_id"],
    ["aligned_dna_short_read_set.aligned_dna_short_reads", "aligned_dna_short_read.aligned_dna_short_read_id"],
    ["called_variants_dna_short_read.aligned_dna_short_read_set_id", "aligned_dna_short_read_set.aligned_dna_short_read_set_id"],
    
]

# Loop through datasets and process potential relationship additions.
# Exactly one row per dataset is recorded in results: "Error" when the schema cannot be read or
# the update fails, "Success" when the update succeeds or nothing needs to be added.
results = []
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
for dataset_id in final_datasets_to_map_dict:
    print("Processing potential relationships for dataset_id = {}".format(dataset_id))
    
    # Retrieve source schema
    src_schema_dict = {}
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["tables"] = response["schema"]["tables"]
        # Normalize a missing relationship list to [] so the comparison below cannot fail on None
        src_schema_dict["relationships"] = response["schema"]["relationships"] or []
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))
        results.append([dataset_id, "Error"])
        # Without a schema there is nothing to compare against; move on so this dataset
        # is not also reported as "Success" further down.
        continue

    # Relationships already present in the dataset, as (from_table, from_column, to_table, to_column).
    # The client's to_dict() output exposes the "from" side under the key "_from".
    existing_links = {
        (rel_entry["_from"]["table"], rel_entry["_from"]["column"], rel_entry["to"]["table"], rel_entry["to"]["column"])
        for rel_entry in src_schema_dict["relationships"]
    }

    # Loop through potential relationships and add those present for the source dataset
    additional_relationships = []
    for rel in potential_relationships:
        from_table, from_column = rel[0].split(".")[:2]
        to_table, to_column = rel[1].split(".")[:2]
        # Both endpoints must exist in this dataset's schema
        if not (bmq.confirm_column_exists(src_schema_dict, from_table, from_column) and bmq.confirm_column_exists(src_schema_dict, to_table, to_column)):
            continue
        # A relationship defined in either direction counts as already present
        forward_link = (from_table, from_column, to_table, to_column)
        reverse_link = (to_table, to_column, from_table, from_column)
        if forward_link in existing_links or reverse_link in existing_links:
            continue
        additional_relationships.append({
            "name": from_table + "_" + from_column + "__to__" + to_table + "_" + to_column,
            "from": {"table": from_table, "column": from_column},
            "to": {"table": to_table, "column": to_column}
        })

    # Submit the schema update request for the TDR dataset
    if additional_relationships:
        schema_update_request = {
            "description": "Adding relationships to support query construction.",
            "changes": {
                "addRelationships": additional_relationships
            }
        }
        try:
            resp = utils.wait_for_tdr_job(datasets_api.update_schema(id=dataset_id, dataset_schema_update_model=schema_update_request))
            # Response is truncated to keep the cell output readable
            print("Schema update successful: " + str(resp)[0:1000])
            results.append([dataset_id, "Success"])
        except Exception as e:
            print("Error running schema update: " + str(e))
            results.append([dataset_id, "Error"])
    else:
        print("No additional relationships to add to schema.")
        results.append([dataset_id, "Success"])

print("Processing of potential relationships for specified datasets complete.")
print("\nResults:")
results_df = pd.DataFrame(results, columns = ["dataset", "status"])
display(results_df)


Processing potential relationships for dataset_id = cefc1a79-446c-40d2-b140-ba8d8b1c0712
TDR Job ID: 0frXGrdZQT6ztryImkK2lg
Schema update successful: ({'id': 'cefc1a79-446c-40d2-b140-ba8d8b1c0712', 'name': 'ANVIL_GREGoR_R01_GRU_20240208', 'description': 'TDR Dataset for AnVIL_GREGoR_R01_GRU', 'defaultProfileId': None, 'dataProject': None, 'defaultSnapshotId': None, 'schema': {'tables': [{'name': 'experiment', 'columns': [{'name': 'experiment_id', 'datatype': 'string', 'array_of': False, 'required': False}, {'name': 'id_in_table', 'datatype': 'string', 'array_of': False, 'required': False}, {'name': 'participant_id', 'datatype': 'string', 'array_of': False, 'required': False}, {'name': 'table_name', 'datatype': 'string', 'array_of': False, 'required': False}, {'name': 'ingest_provenance', 'datatype': 'string', 'array_of': False, 'required': False}], 'primaryKey': [], 'partitionMode': 'none', 'datePartitionOptions': None, 'intPartitionOptions': None, 'rowCount': None}, {'name': 'family',

Unnamed: 0,dataset,status
0,cefc1a79-446c-40d2-b140-ba8d8b1c0712,Success
1,2355554e-8951-4b41-bcd8-32e18cddb7c9,Success


## Retrieve Mapping Artifacts and Run Query Construction
Retrieve the artifacts you would like to use to construct transformation queries for your datasets, based on the previously specified target schema and mapping specification. These transformation queries will then be dynamically constructed based on the appropriate target schema, mapping specification, and source schema. 

In [None]:
# Loop through datasets and process transformation query construction
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
results = []
for dataset_id in final_datasets_to_map_dict:
    print("Building transformation queries for dataset_id = {}".format(dataset_id))

    # Collect mapping variables
    mapping_target = final_datasets_to_map_dict[dataset_id]["mapping_target"]
    mapping_target_spec = final_datasets_to_map_dict[dataset_id]["mapping_target_spec"]
    
    # Retrieve source schema
    src_schema_dict = {}
    try:
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["name"] = response["name"]
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
        phs_id = response["phs_id"]
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))

    # Set dataset name and project name parameters to substitute into transform queries
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = re.sub("'", "", utils.derive_project_name(dataset_id, phs_id, dataset_name_value))

    # Retrieve target schema and mapping specification
    target_schema_dict = {}
    mapping_spec = {}
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(ws_bucket_name)
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
        target_schema_dict = json.loads(blob.download_as_string(client=None))
    except Exception as e:
        print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
        blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
        blob_string = blob_string.replace("$BQ_DATASET", bq_project + "." + bq_schema)
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

    # Build queries from mapping specification
    query_dict = {}
    if target_schema_dict:
        for target_table in target_schema_dict["tables"]:
            table_name = target_table["name"]
            missing_artifacts = False
            if src_schema_dict and mapping_spec:
                query_dict[table_name] = bmq.build_mapping_query(target_table, src_schema_dict, mapping_spec, bq_project, bq_schema)
            else:
                missing_artifacts = True
                query_dict[table_name] = {"query": "", "syntax_check": ""} 
        if missing_artifacts == True:
            print("Source schema dictionary and/or mapping specification missing. Unable to generate queries.")
            results.append([dataset_id, "Error"])
    else:
        print("Target schema dictionary missing. Unable to generate queries.")
        results.append([dataset_id, "Error"])
    
    # Evaluate queries -- Publish if no issues found, otherwise convert to dataframe and display
    failure_count = 0
    for key, val in query_dict.items():
        if val["syntax_check"] != "Passed" and val["syntax_check"] != None:
            failure_count += 1
    if failure_count == 0:
        print("No failures found in query construction, publishing to the cloud.")
        results.append([dataset_id, "Success"])
        # Copy target schema file to output folder for mapping target
        source_path = "ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target)
        destination_path = "ingest_pipeline/output/transformed/{}/{}/schema/mapping_schema_object.json".format(mapping_target, dataset_id)
        !gsutil cp $ws_bucket/$source_path $ws_bucket/$destination_path 2> stdout

        # Limit query dict to valid queries, write out, and copy to output folder for mapping target
        valid_query_dict = {}
        for target, val in query_dict.items():
            if val["syntax_check"] == "Passed":
                valid_query_dict[target] = val
        final_query_dict = {
            "dataset_id": dataset_id,
            "transforms": valid_query_dict
        }
        query_dict_json = json.dumps(final_query_dict)
        query_output_file = "transform_query_set.json"
        with open(query_output_file, 'w') as outfile:
            outfile.write(query_dict_json)
        destination_path = "ingest_pipeline/output/transformed/{}/{}/queries".format(mapping_target, dataset_id)
        !gsutil cp $query_output_file $ws_bucket/$destination_path/ 2> stdout
    else:
        print("Failures found in query construction, must be resolved before publishing.")
        print("Query building results:")
        results.append([dataset_id, "Error"])
        query_df = pd.DataFrame.from_dict(query_dict, orient="index")
        query_df.index.name = "target_table"
        query_df.reset_index(inplace=True)
        display(query_df)

print("Transformation query construction and processing complete.")
print("\nResults:")
results_df = pd.DataFrame(results, columns = ["dataset", "status"])
display(results_df)


## Evaluate Vocabulary Mapping
For target attributes leveraging the "VOCAB_MAP" transformation, evaluate whether the source values have a record in the dsp-data-ingest.transform_resources.vocab_map table. If additional mappings are needed, these should be put into place before the transformation queries are executed.

In [None]:
# Set display parameter
# True shows only source values without a mapped value; False additionally shows the full result
show_only_missing_maps = True

# Loop through datasets and process vocabulary mapping evaluation.
# The mapping specification is prepared the same way as in the query construction cell
# ($DATASET_NAME, $PROJECT_NAME and $BQ_DATASET substituted) so both cells evaluate the same text.
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
# Mapping artifacts for every dataset are read from the same workspace bucket, so connect once
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
for dataset_id in final_datasets_to_map_dict:
    print("Evaluating vocabulary mapping for dataset_id = {}".format(dataset_id))

    # Collect mapping variables
    mapping_target = final_datasets_to_map_dict[dataset_id]["mapping_target"]
    mapping_target_spec = final_datasets_to_map_dict[dataset_id]["mapping_target_spec"]
    
    # Retrieve source schema
    src_schema_dict = {}
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["name"] = response["name"]
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
        phs_id = response["phs_id"]
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))
        # Skip this dataset: continuing would either fail on the missing schema name or silently
        # reuse bq_project / bq_schema / phs_id left over from the previous dataset.
        continue

    # Set dataset name and project name parameters to substitute into transform queries
    # (single quotes are removed from the project name, matching the query construction cell)
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = re.sub("'", "", utils.derive_project_name(dataset_id, phs_id, dataset_name_value))

    # Retrieve target schema and mapping specification
    target_schema_dict = {}
    mapping_spec = {}
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
        target_schema_dict = json.loads(blob.download_as_string(client=None))
    except Exception as e:
        print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
        blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
        # Without this substitution the literal "$BQ_DATASET" placeholder would be evaluated as-is
        blob_string = blob_string.replace("$BQ_DATASET", bq_project + "." + bq_schema)
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

    # The evaluation needs both artifacts; the load errors were already printed above
    if not (target_schema_dict and mapping_spec):
        print("Target schema and/or mapping specification missing. Unable to evaluate vocabulary mapping.")
        continue

    # Evaluate vocab mapping and display results
    df = bmq.evaluate_vocab_mapping(mapping_spec, src_schema_dict, target_schema_dict, bq_project, bq_schema)
    print("-------------------------------------------")
    print("Missing mapped_value view:")
    print("-------------------------------------------")
    # Rows that have a source value but no mapped value
    display(df[df["mapped_value"].isnull() & df["source_value"].notnull()])
    if not show_only_missing_maps:
        print("\n-------------------------------------------")
        print("Full view:")
        print("-------------------------------------------")
        display(df)
    
print("Vocabulary mapping evaluation and processing complete.")


## [Optional] Update/Override Generated Queries as Necessary
Review any queries that have not passed the syntax check, as these need to be remedied before they can be published and executed. Any other queries that do not align with expectations can be overridden by either A) Updating the mapping target specification and re-running the previous step, or B) Manually overriding the query below. Option B should only be used in one-off cases.

### Build Base Query Dictionary

In [None]:
# Input the appropriate dataset and mapping target specification
dataset_id = "f1e1ef01-d52d-423e-a65b-3a1d26c7ee9d"
mapping_target = "anvil"
mapping_target_spec = "cmg_ext_2"

# Retrieve source schema.
# src_schema_dict is only populated after every required field has been read, so a
# partial failure cannot leave a half-built schema that passes the truthiness check
# below while bq_project / bq_schema / phs_id are still undefined.
src_schema_dict = {}
bq_project = None
bq_schema = None
phs_id = None
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
try:
    response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
    retrieved_schema = {
        "name": response["name"],
        "tables": response["schema"]["tables"],
        "relationships": response["schema"]["relationships"],
    }
    bq_project = response["access_information"]["big_query"]["project_id"]
    bq_schema = response["access_information"]["big_query"]["dataset_name"]
    phs_id = response["phs_id"]
    src_schema_dict = retrieved_schema
except Exception as e:
    print("Error retrieving source schema from TDR. Error: {}".format(e))

# Set dataset name and project name parameters to substitute into transform queries.
# When the source schema could not be retrieved these stay empty, and the query-building
# step below reports the missing artifact instead of this cell failing with a KeyError.
dataset_name_value = ""
project_name_value = ""
if src_schema_dict:
    # Strip a trailing "_<digits>" suffix from the dataset name
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = utils.derive_project_name(dataset_id, phs_id, dataset_name_value)

# Retrieve target schema and mapping specification
target_schema_dict = {}
mapping_spec = {}
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
    target_schema_dict = json.loads(blob.download_as_string(client=None))
except Exception as e:
    print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
    blob_string = blob.download_as_text(client=None)
    blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
    blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
    mapping_spec = json.loads(blob_string)
except Exception as e:
    print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

# Build queries from mapping specification
query_dict = {}
if target_schema_dict:
    # Loop-invariant, so it is evaluated once up front; this also keeps the flag defined
    # when the target schema lists no tables.
    missing_artifacts = not (src_schema_dict and mapping_spec)
    for target_table in target_schema_dict["tables"]:
        table_name = target_table["name"]
        if missing_artifacts:
            # Placeholder entry so every target table still appears in the displayed frame
            query_dict[table_name] = {"query": "", "syntax_check": ""}
        else:
            query_dict[table_name] = bmq.build_mapping_query(target_table, src_schema_dict, mapping_spec, bq_project, bq_schema)
    if missing_artifacts:
        print("Source schema dictionary and/or mapping specification missing. Unable to generate queries.")
else:
    print("Target schema dictionary missing. Unable to generate queries.")

# Display query dictionary (one row per target table)
query_df = pd.DataFrame.from_dict(query_dict, orient="index")
query_df.index.name = "target_table"
query_df = query_df.reset_index()
display(query_df)
    


### Update Query Dict as Necessary

In [None]:
# Manual override: name the target table and supply the replacement query text
target_table = "anvil_donor"
query = "SELECT 1"

# Store the replacement query first, then re-run the syntax check and record its outcome
query_dict[target_table].update(query=query)
query_dict[target_table].update(syntax_check=bmq.run_syntax_check(query))
print(query_dict[target_table])


### Publish Updated Query Dict

In [None]:
# Copy target schema file to output folder for mapping target
# Relies on mapping_target, dataset_id and query_dict left in the kernel by the earlier
# cells of this section, and on ws_bucket being defined elsewhere in the notebook.
source_path = "ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target)
destination_path = "ingest_pipeline/output/transformed/{}/{}/schema/mapping_schema_object.json".format(mapping_target, dataset_id)
# IPython shell escape: $name is replaced with the Python variable's value.
# NOTE(review): "2> stdout" sends gsutil's stderr to a file literally named "stdout" in the
# working directory, so copy errors are not shown in the cell output - confirm this is intended.
!gsutil cp $ws_bucket/$source_path $ws_bucket/$destination_path 2> stdout

# Limit query dict to valid queries, write out, and copy to output folder for mapping target
valid_query_dict = {}
for target, val in query_dict.items():
    # Only entries whose syntax check is exactly "Passed" are published
    if val["syntax_check"] == "Passed":
        valid_query_dict[target] = val
final_query_dict = {
    "dataset_id": dataset_id,
    "transforms": valid_query_dict
}
query_dict_json = json.dumps(final_query_dict)
# Written to the notebook's working directory before being copied to the bucket
query_output_file = "transform_query_set.json"
with open(query_output_file, 'w') as outfile:
    outfile.write(query_dict_json)
# destination_path is reused here for the queries folder
destination_path = "ingest_pipeline/output/transformed/{}/{}/queries".format(mapping_target, dataset_id)
!gsutil cp $query_output_file $ws_bucket/$destination_path/ 2> stdout

# "T" Pipeline: Load Additional Transformed Tables to TDR

## Pipeline Run Variables

In [None]:
# Run Variables
dataset_id_run_list = [
    #["dataset_id", Run (True/False)],
    ['a36eeaf7-d6dd-4887-bdbd-e435a07ba156', True],
]
params["mapping_target"] = "anvil"
params["skip_transforms"] = True
params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names
params["skip_schema_extension"] = True
params["skip_ingests"] = True
params["trunc_before_ingest"] = True
params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
params["skip_file_relation_inference"] = True
params["skip_dangling_fk_resolution"] = True
params["skip_supplementary_file_identification"] = True
params["skip_snapshot_creation"] = False
params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org", "auth-domain"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
params["skip_data_validation"] = True

#-----------------------------------------------------------------------------------------------------------#

# Print variables
print("Pipeline run variables set:")
print("Profile ID: " + params["profile_id"])
print("Mapping Target: " + params["mapping_target"])
print("Datasets to run: ")
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
dataset_skip_list = []
for dataset in dataset_id_run_list:
    # Second element of each entry is the run flag; entries flagged False are ignored entirely
    if dataset[1]:
        dataset_id = dataset[0]
        try:
            dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            dataset_name = dataset_info["name"]
            phs_id = dataset_info["phs_id"]
            consent_name = dataset_info["properties"]["consent_name"]
            auth_domains = dataset_info["properties"]["auth_domains"]
            src_workspaces = dataset_info["properties"]["source_workspaces"]
        except Exception:
            # Any retrieval failure or missing property puts the dataset on the skip list.
            # Exception (not a bare except) so a keyboard interrupt still stops the cell.
            dataset_name = ""
            dataset_skip_list.append(dataset_id)
        if dataset_name:
            print("- " + dataset_name + " ({})".format(dataset_id))
            # str() keeps the summary printable when the dataset carries no PHS ID / consent name
            print("\t- PHS ID = " + str(phs_id))
            print("\t- Consent Short Name = " + str(consent_name))
            print("\t- Auth Domains = " + str(auth_domains))
            print("\t- Source Workspaces = " + str(src_workspaces))
if dataset_skip_list:
    print("Datasets to skip (they either don't exist or aren't accessible to the current user): ")
    print("\t- " + "\n\t- ".join(dataset_skip_list))
print("Skip transforms? " + str(params["skip_transforms"]))
print("Transforms override list: " + str(params["transform_list_override"]))
print("Skip schema extension? " + str(params["skip_schema_extension"]))
print("Skip ingests? " + str(params["skip_ingests"]))
print("Truncate tables before ingest? " + str(params["trunc_before_ingest"]))
print("Ingest override list: " + str(params["ingest_list_override"]))
print("Skip file relationship inference? " + str(params["skip_file_relation_inference"]))
print("Skip dangling foreign key resolution? " + str(params["skip_dangling_fk_resolution"]))
print("Skip supplementary file identification? " + str(params["skip_supplementary_file_identification"]))
print("Skip snapshot creation? " + str(params["skip_snapshot_creation"]))
print("Skip data validation? " + str(params["skip_data_validation"]))


## Pipeline Execution

In [None]:
# Loop through and execute pipeline for listed workspaces
for dataset in dataset_id_run_list:
    # Second element of each entry is the run flag
    if dataset[1]:
        dataset_id = dataset[0]
        try:
            # A fresh API client is requested for every dataset
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            dataset_name = dataset_info["name"]
            phs_id = dataset_info["phs_id"]
            consent_name = dataset_info["properties"]["consent_name"]
            auth_domains = dataset_info["properties"]["auth_domains"]
            # Not passed to the pipeline, but a dataset without this property is skipped,
            # matching the check made in the run-variables cell.
            src_workspaces = dataset_info["properties"]["source_workspaces"]
        except Exception as e:
            # Report the skip instead of dropping the dataset silently. Exception (not a bare
            # except) so a keyboard interrupt still stops the run.
            print("Skipping dataset {} - unable to retrieve dataset details. Error: {}".format(dataset_id, e))
            dataset_name = ""
        if dataset_name:
            params["dataset_id"] = dataset_id
            params["dataset_name"] = dataset_name
            params["phs_id"] = phs_id
            params["consent_name"] = consent_name
            params["auth_domains"] = auth_domains
            utils.run_t_pipeline(params)
        

# Utility Scripts
Uncomment sections as necessary to accomplish various miscellaneous tasks.

## Collect AnVIL Snapshots and Datasets

In [None]:
# Dataset_ID Filter (leave empty to report on every dataset returned by the "anvil" filter)
dataset_id_list = [
]

# Collect Anvil datasets and snapshots
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
logging.info(f"Start time: {current_datetime_string}")
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
datasets_list = datasets_api.enumerate_datasets(filter="anvil", limit=2000)
# Denominator used only for the progress log messages
if dataset_id_list:
    dataset_list_len = min(len(datasets_list.items), len(dataset_id_list))
else:
    dataset_list_len = len(datasets_list.items)
records_list = []
dataset_count = 0
for dataset_entry in datasets_list.items:
    if len(dataset_id_list) == 0 or dataset_entry.id in dataset_id_list:
        dataset_count += 1
        logging.info(f"Processing dataset {dataset_count} of {dataset_list_len}")
        # Only datasets named ANVIL_<name>_<8 digits> (case-insensitive) are reported
        if re.match("^ANVIL_[a-zA-Z0-9-_]+_[0-9]{8}", dataset_entry.name.upper()):
            dataset_detail = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["PROPERTIES", "DATA_PROJECT"])
            snapshots_list = snapshots_api.enumerate_snapshots(dataset_ids=[dataset_entry.id], limit=1000)
            try:
                source_workspace = ", ".join(dataset_detail.properties["source_workspaces"])
            except Exception:
                # Missing or malformed properties: report the dataset with a blank source workspace
                source_workspace = ""
            if len(snapshots_list.items) == 0:
                # Dataset without snapshots: the nine snapshot columns are left empty
                record = [None, None, None, None, None, None, None, None, None, dataset_entry.id, dataset_entry.name, dataset_detail.ingest_service_account, dataset_entry.created_date[0:10], dataset_entry.cloud_platform, dataset_entry.secure_monitoring_enabled, source_workspace]
                records_list.append(record)
            else:
                snapshot_list_len = len(snapshots_list.items)
                snapshot_count = 0
                for snapshot_entry in snapshots_list.items:
                    snapshot_count += 1
                    logging.info(f"Processing snapshot {snapshot_count} of {snapshot_list_len} for dataset {dataset_count}")
                    # Get public policy information
                    creds, project = google.auth.default()
                    auth_req = google.auth.transport.requests.Request()
                    creds.refresh(auth_req)
                    public_flag = "N"
                    public_response = requests.get(
                        url=f"https://sam.dsde-prod.broadinstitute.org/api/resources/v2/datasnapshot/{snapshot_entry.id}/policies/reader/public",
                        headers={"Authorization": f"Bearer {creds.token}"},
                    )
                    if public_response.text == "true":
                        public_flag = "Y"
                    # Get snapshot DUOS ID and Lock status
                    api_client = utils.refresh_tdr_api_client()
                    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
                    snapshot_detail = snapshots_api.retrieve_snapshot(id=snapshot_entry.id, include=["DUOS"])
                    duos_id = ""
                    if snapshot_detail.duos_firecloud_group:
                        duos_id = snapshot_detail.duos_firecloud_group.duos_id
                    lock_name = snapshot_detail.resource_locks.exclusive
                    if lock_name:
                        lock_status = True
                    else:
                        lock_status = False
                    # Get snapshot readers and auth domain
                    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_entry.id)
                    # Reset per snapshot so a snapshot without a "reader" policy does not
                    # inherit the readers of the previously processed snapshot.
                    readers = ""
                    for role in snapshot_policy_response.policies:
                        if role.name == "reader":
                            readers = ", ".join(role.members)
                    ad_groups = ""
                    if snapshot_policy_response.auth_domain:
                        ad_groups = ", ".join(snapshot_policy_response.auth_domain)
                    record = [snapshot_entry.id, snapshot_entry.name, snapshot_entry.created_date[0:10], public_flag, readers, ad_groups, duos_id, snapshot_entry.data_project, lock_status, dataset_entry.id, dataset_entry.name, dataset_detail.ingest_service_account, dataset_entry.created_date[0:10], dataset_entry.cloud_platform, dataset_entry.secure_monitoring_enabled, source_workspace]
                    records_list.append(record)
# One row per snapshot, or one row per dataset that has no snapshots
df = pd.DataFrame(records_list, columns =["Snapshot ID", "Snapshot Name", "Snapshot Created Date", "Snapshot Public", "Snapshot Readers", "Snapshot Auth Domain", "Snapshot DUOS ID", "Snapshot Data Project", "Snapshot Locked", "Source Dataset ID", "Source Dataset Name", "Source Dataset SA", "Source Dataset Created Date", "Cloud Platform", "Secure Monitoring", "Source Workspace"])
df_sorted = df.sort_values(["Source Workspace", "Source Dataset Name", "Snapshot Name"], ascending=[True, True, True], ignore_index=True)
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
logging.info(f"End time: {current_datetime_string}")
display(df_sorted)


## Soft Deletion of TDR Dataset Records

In [None]:
# Input parameters

# Datasets whose records will be soft deleted
dataset_id_list = [
    "f9224ea2-dd31-421d-80d4-f35082ef8d68",
    "d7bcfc5d-e258-4bd6-a413-bb7a118e6bff",
    "6d18aafc-0240-499c-902e-a72a5b98ff0a",
    "033fc1e1-0337-4656-bbe1-3f06fef641e9",
]

# Tables processed for each dataset (alternative table sets are kept below, commented out)
table_list = [
    "anvil_activity",
    "anvil_alignmentactivity",
    "anvil_antibody",
    "anvil_assayactivity",
    "anvil_biosample",
    "anvil_dataset",
    "anvil_diagnosis",
    "anvil_donor",
    "anvil_file",
    "anvil_project",
    "anvil_sequencingactivity",
    "anvil_variantcallingactivity",
]
#table_list = ["file_inventory", "sample", "subject", "workspace_attributes", "sequencing", "qc_result_sample", "family", "chromosome", "interval", "participant", "discovery", "sample_set", "vcf"]
#table_list = ['file_inventory']

# True: delete every row of each listed table; False: delete only the ids in delete_record_list
delete_all_records = True
delete_record_list = []  # Will be ignored if delete_all_records is set to True

#--------------------------------------------------------------------------------------------------------

# Soft-delete a given set of rows from one table of a TDR dataset
def delete_datarepo_rows(dataset_id, table_name, datarepo_row_ids):
    """Submit a soft-delete request for specific rows and print the outcome.

    Args:
        dataset_id: ID of the TDR dataset that owns the table.
        table_name: Name of the table the rows belong to.
        datarepo_row_ids: Row ids to delete; when empty no request is sent.

    Returns:
        None. The job result, or the error text if the request fails, is printed.
    """
    print("Attempting to delete specified rows from {} for dataset {}".format(table_name, dataset_id))

    # Guard clause: nothing to delete, so no API call is made
    if not datarepo_row_ids:
        print("No datarepo_row_ids specified for deletion.")
        return

    # Assemble the request from the inside out
    row_spec = {"rowIds": datarepo_row_ids}
    table_spec = {"tableName": table_name, "jsonArraySpec": row_spec}
    deletion_request = {"deleteType": "soft", "specType": "jsonArray", "tables": [table_spec]}

    try:
        tdr_client = utils.refresh_tdr_api_client()
        dataset_api = data_repo_client.DatasetsApi(api_client=tdr_client)
        submitted_job = dataset_api.apply_dataset_data_deletion(id=dataset_id, data_deletion_request=deletion_request)
        outcome, _job_id = utils.wait_for_tdr_job(submitted_job)
        print("Result: {}".format(outcome))
    except Exception as err:
        # Failures are reported, not raised, so the caller can continue with the next table
        print("Error: {}".format(str(err)))

# Function to collect all datarepo rows for a particular table within a dataset
def collect_all_datarepo_rows(dataset_id, table_name):
    """Return the datarepo_row_id of every row in one table of a TDR dataset.

    When the dataset's access information includes BigQuery details, the ids are read with
    a BigQuery query. Otherwise they are paged through the TDR dataset data-query endpoint,
    1000 rows per request.

    Args:
        dataset_id: ID of the TDR dataset.
        table_name: Name of the table to read row ids from.

    Returns:
        A list of datarepo_row_id values. An empty list is returned when the dataset
        information or the BigQuery results cannot be retrieved. If paging fails after
        the retries are exhausted, the ids collected so far are returned.
    """
    # Work out how the dataset is hosted. A failure here ends the call with an empty
    # list, because nothing below can run without the dataset's access information.
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        big_query_info = response["access_information"]["big_query"]
        if big_query_info:
            bq_project = big_query_info["project_id"]
            bq_schema = big_query_info["dataset_name"]
    except Exception as e:
        print("Error retrieving dataset information: {}".format(str(e)))
        return []

    # BigQuery-backed dataset: a single query returns every row id
    if big_query_info:
        client = bigquery.Client()
        query = "SELECT datarepo_row_id FROM `{project}.{schema}.{table}`".format(project = bq_project, schema = bq_schema, table = table_name)
        try:
            query_job = client.query(query)
            return [row["datarepo_row_id"] for row in query_job]
        except Exception as e:
            print("Error retrieving datarepo_row_id list: {}".format(str(e)))
            return []

    # Dataset without BigQuery access information: page through the TDR data-query endpoint
    max_page_size = 1000
    max_retries = 5
    records_fetched = 0
    total_record_count = 1  # Placeholder until the first response reports the real total
    results = []
    while records_fetched < total_record_count:
        payload = {
          "offset": records_fetched,
          "limit": max_page_size,
          "sort": "datarepo_row_id",
          "direction": "asc",
          "filter": ""
        }
        attempt_counter = 0
        while True:
            try:
                dataset_results = datasets_api.query_dataset_data_by_id(id=dataset_id, table=table_name, query_data_request_model=payload).to_dict()
                total_record_count = dataset_results["total_row_count"]
                # Read the whole page before touching results, so a failure part-way
                # through a page followed by a retry cannot append duplicate ids.
                page_row_ids = [record["datarepo_row_id"] for record in dataset_results["result"]]
                break
            except Exception:
                if attempt_counter < max_retries:
                    sleep(10)
                    attempt_counter += 1
                else:
                    warn_str = "Error retrieving data_repo_row_ids for table."
                    logging.warning(warn_str)
                    return results
        if not page_row_ids:
            # An empty page while the reported total is still larger would otherwise
            # request the same offset forever.
            break
        results.extend(page_row_ids)
        records_fetched += len(page_row_ids)
    return results
    
# Drive the soft deletion across every dataset/table combination
def execute_deletions(dataset_id_list, table_list, delete_all_records, delete_record_list):
    """Soft delete records from each listed table of each listed dataset.

    Args:
        dataset_id_list: IDs of the datasets to process.
        table_list: Table names processed for every dataset.
        delete_all_records: When truthy, every row id of the table is collected and
            deleted; otherwise delete_record_list is used as given.
        delete_record_list: Explicit row ids to delete when delete_all_records is falsy.
    """
    for dataset_id in dataset_id_list:
        print(f"Processing record deletions for dataset {dataset_id}")
        for table in table_list:
            print(f"Processing record deletion for {table}")
            # Pick the row ids: the whole table, or the caller's explicit list
            row_ids = collect_all_datarepo_rows(dataset_id, table) if delete_all_records else delete_record_list
            if not row_ids:
                print("No records specified for deletion.")
                continue
            delete_datarepo_rows(dataset_id, table, row_ids)
                
#--------------------------------------------------------------------------------------------------------

# Run the soft deletion with the input parameters defined at the top of this cell
execute_deletions(
    dataset_id_list=dataset_id_list,
    table_list=table_list,
    delete_all_records=delete_all_records,
    delete_record_list=delete_record_list,
)



## Lock/Unlock Snapshots

In [None]:
#############################################
## Functions
#############################################

def update_snapshot_lock_status(snapshot_action, snapshot_id_list):
    """Lock or unlock each listed TDR snapshot and display a per-snapshot result table.

    Args:
        snapshot_action: Exactly "LOCK" or "UNLOCK". Any other value is reported as a
            single failure row and no snapshot is touched.
        snapshot_id_list: IDs of the snapshots to apply the action to.

    Returns:
        None. A DataFrame with the columns snapshot, action, status and errors is displayed.
        Unlocking a snapshot that holds no exclusive lock is recorded as a success.
    """
    results = []
    # Validate snapshot action
    print(f"Validating provided snapshot action: {snapshot_action}")
    if snapshot_action not in ["LOCK", "UNLOCK"]:
        results.append(["ALL", snapshot_action, "Failure", "Invalid snapshot action specified. Must be LOCK or UNLOCK."])
    else:
        # Loop through and process snapshots
        for snapshot_id in snapshot_id_list:

            # Initialize (a fresh API client is requested for every snapshot)
            print(f"Updating snapshot lock status for snapshot: {snapshot_id}.")
            api_client = utils.refresh_tdr_api_client()
            snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

            if snapshot_action == "LOCK":
                # Lock snapshot
                try:
                    snapshots_api.lock_snapshot(id=snapshot_id)
                    results.append([snapshot_id, snapshot_action, "Success", None])
                except Exception as e:
                    error_str = f"Error updating snapshot lock status: {str(e)}"
                    print(error_str)
                    results.append([snapshot_id, snapshot_action, "Failure", error_str])
                continue

            # Fetch exclusive lock from snapshot (if there is one)
            try:
                snapshot_detail = snapshots_api.retrieve_snapshot(id=snapshot_id).to_dict()
                lock_name = snapshot_detail["resource_locks"].get("exclusive")
            except Exception as e:
                error_str = f"Error retrieving lock on snapshot: {str(e)}"
                # Printed like the other failure paths, so the error shows up while the loop runs
                print(error_str)
                results.append([snapshot_id, snapshot_action, "Failure", error_str])
                continue

            if not lock_name:
                results.append([snapshot_id, snapshot_action, "Success", "No existing lock found on snapshot."])
                continue

            # Unlock snapshot (if locked); the unlock request names the lock being released
            try:
                request_body = {"lockName": lock_name, "forceUnlock": False}
                snapshots_api.unlock_snapshot(id=snapshot_id, unlock_resource_request=request_body)
                results.append([snapshot_id, snapshot_action, "Success", None])
            except Exception as e:
                error_str = f"Error updating snapshot lock status: {str(e)}"
                print(error_str)
                results.append([snapshot_id, snapshot_action, "Failure", error_str])

    # Display results
    print("\nResults:")
    results_df = pd.DataFrame(results, columns = ["snapshot", "action", "status", "errors"])
    display(results_df)

#############################################
## Input Parameters
#############################################

# Action applied to every listed snapshot; must be exactly "LOCK" or "UNLOCK"
snapshot_action = "UNLOCK"

# Snapshots the action is applied to
snapshot_id_list = ["c3e5c093-3156-4b4c-be3a-2c307c3d8b23"]

#############################################
## Execution
#############################################

update_snapshot_lock_status(snapshot_action=snapshot_action, snapshot_id_list=snapshot_id_list)

## TDR Dataset and/or Snapshot Deletion

In [None]:
# Delete a single TDR snapshot by ID
def delete_snapshot(snapshot_id):
    """Submit a delete request for one TDR snapshot and print the outcome.

    Args:
        snapshot_id: ID of the snapshot to delete.

    Returns:
        None. The job result is printed on success; on failure the error is printed and
        no exception is raised.
    """
    snapshots_api = data_repo_client.SnapshotsApi(api_client=utils.refresh_tdr_api_client())
    print("Attempting to delete snapshot = {}".format(snapshot_id))
    try:
        submitted_job = snapshots_api.delete_snapshot(id=snapshot_id)
        outcome, _job_id = utils.wait_for_tdr_job(submitted_job)
    except Exception as err:
        print("Error: {}".format(err))
    else:
        print("Result: {}".format(outcome))

# Delete a single TDR dataset by ID
def delete_dataset(dataset_id):
    """Submit a delete request for one TDR dataset and print the outcome.

    Args:
        dataset_id: ID of the dataset to delete.

    Returns:
        None. The job result is printed on success; on failure the error is printed and
        no exception is raised.
    """
    datasets_api = data_repo_client.DatasetsApi(api_client=utils.refresh_tdr_api_client())
    print("Attempting to delete dataset = {}".format(dataset_id))
    try:
        submitted_job = datasets_api.delete_dataset(id=dataset_id)
        outcome, _job_id = utils.wait_for_tdr_job(submitted_job)
    except Exception as err:
        print("Error: {}".format(err))
    else:
        print("Result: {}".format(outcome))

# Function to delete a specific TDR Dataset and all of its Snapshots
def delete_dataset_and_all_snapshots(dataset_id, snapshot_limit=1000):
    """Delete every snapshot built from a dataset, then delete the dataset itself.

    Args:
        dataset_id: ID of the TDR dataset to delete.
        snapshot_limit: Maximum number of snapshots requested from the enumeration call.
            Passed explicitly so the call does not fall back to the API's default page
            size, which would leave later snapshots undeleted.

    Returns:
        None. Each deletion prints its own outcome through delete_snapshot and
        delete_dataset.
    """
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    print("Attempting to delete dataset = {} and all associated snapshots".format(dataset_id))
    # Delete snapshots
    snapshot_list = snapshots_api.enumerate_snapshots(dataset_ids=[dataset_id], limit=snapshot_limit)
    for snapshot in snapshot_list.items or []:
        delete_snapshot(str(snapshot.id))
        # Pause between snapshot deletions
        sleep(10)
    # Delete dataset
    delete_dataset(dataset_id)

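# Delete snapshots: one-off list of snapshot IDs to remove. Replace it with the IDs you intend to delete before running this cell.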
snapshot_id_list = [
    'c42f8bf8-9aba-44c7-98f4-84b86c66e967',
    '0c1e99d3-8dec-4b42-908b-2ce0c6f8b01b',
    '8c8ca5fb-751e-4398-836c-9e80e5e998be',
    'b1aa512a-5c3d-46ac-bc8d-72480f63c627',
    '24cdba87-1592-40d0-b9e5-904539799a4a',
    'c9037419-367e-439c-a247-b0dae7c24146',
    '61e98f2e-7469-4d19-a6b5-05adec0c25cf',
    '172844bd-976a-44c2-8612-fa0d2981bab6',
    '1bb208f2-ecf3-4589-a9bd-b6e94178584d',
    '5773565d-ad7c-4f51-8b4f-f1ee5dffc08a',
    '2e5c5fe3-3af4-4c34-a85e-af6b4135f089',
    '8e73f31d-403a-458a-a1d2-c9048c24310b',
    '6dcadf4a-71db-498f-87be-3b6bcec912e5',
    '0254cc08-1474-4b3c-ae99-f7d853042dc8',
    'f1c03eab-24bc-4b3a-8aa9-d6696dfaaf31',
    '27068295-b3c0-4260-9447-9ca96814d46f',
    '0d2b8eff-199a-40d5-9b93-c4ab6fa5a7f4',
    'fd57042b-4676-49ba-9d2e-161c83e0f3bf',
    'a39fc400-2146-4949-9a94-fd3d4f1b182c',
    '060c707a-2f0d-4730-bbd6-d25489abfcf6',
    'a3b18d45-96c2-4526-8fde-65ab3265868f',
    '6b8b2cc4-be14-443e-bda5-eed5fe0ffb2e',
    'ddf7ee7d-3234-4f8d-a1e4-305588cd1009',
    'cc8cc17d-1ae4-4303-abd2-4728a676e5a2',
    '613ec6f4-dafb-4689-b109-4573ddca5853',
    'e3823ffe-3070-47b2-a0fc-7c0138e6c61a',
    'ab1d91ba-6aa6-4a40-8c10-2a979cfb29ab',
    '53beb758-f11b-4b3d-8680-14ed3819b78e',
    '0a2ecbaa-02fb-41a6-8f3a-9155dc1e8033',
    '5e1eeafe-a65b-4ee0-8246-e17f2b761423',
    'ae101395-36eb-4d59-9970-6696b82057db',
    '60aa48c3-d4d2-45a5-bd68-dc0ae49c6300',
    'd431721f-060a-4b9a-b4d5-0d19fbf6ae0e',
    '4f6a3c60-28f1-4b2d-86d9-a89b23ddbea5',
    '696423cf-80a7-4c79-8050-63ffa023862b',
    'e95d4773-7a36-4031-ba31-920856187300',
    '44fc0f37-37f6-4f37-b649-cdcc19b2e97e',
    '0c9b5acf-975b-43f6-b23a-2d8a389e2e15',
    '3984cfaf-0034-4b7e-ae21-8ae9810a62a1',
    '1e4c8909-4e94-47b0-a506-ec8eabd004ad',
    '5eaf7f6c-f1a0-4c2c-9838-dc73088b5e0a',
    '42d2dc1b-9ffd-41ad-84b9-b92ed984470f',
    'e22fb210-d913-49ff-a515-8a572cd45f5e',
    'cdc8e909-6dd8-424c-b351-95db1fd45ccf',
    'f6424345-87ab-44fc-85c9-63606675088f',
    '728f209a-ef9c-4303-a93f-a7958dc40f0c',
    '0ffa30ef-91b1-4908-b148-58191f64c97d',
    'c8c3bb66-e4bd-456f-9d38-e82816118807',
    'f0b3a0a5-b857-4903-9e05-c5ac77bb7e79',
    '2d5d9ec7-b748-4ae2-806f-f4bd687c36da',
    'e203302a-91f5-45b4-b64a-095f9a5917ee',
    'f7dbd618-138e-4e0d-907c-19e7f29aa53d',
    'c67cc28d-48e3-4336-8c41-323924e44b0f',
    '2f4d740e-08cb-4b21-a82e-35090463c651',
    'f08a2b91-8838-4d0a-9492-f56ea25e84f9',
    '407bebb2-cfa4-4306-b087-744e93b3b3ea',
    'd64162b6-80f5-4ddb-8e5c-6fbdbb16ac1d',
    '51865c0a-9548-4fea-a6e5-c8754a0bb085',
    '4fe793f8-96bd-4ece-a8b2-1e4fb6712b99',
    '9a17587a-ae6b-481a-8d88-f479981c767f',
    'd97135a2-ed36-4c3f-919f-89d217e01ced',
    'fa6c5fa8-f513-4101-ab7d-a39ba841bdfa',
    '9807332e-22e3-41de-bc41-a9944ba364fc',
    '59fadb0b-eff4-4d6a-9454-d740b523ec10',
    '0d85a6bc-fa74-4933-8537-61d4792159ee',
    'a1825442-cc05-42a3-92aa-f66b1d348f63',
    'e6ccc82e-e11a-4276-a403-cf93a6e5c40c',
    'a0408818-ad55-42fa-a1b9-84537a4b3eed',
    '24845acb-9a94-4641-9216-5ec60a575ce5',
    'c191a23a-926c-4a61-8294-27496a41a4da',
    'bd520cc7-184e-4dad-81dc-f0e0afac19ea',
    '2fb44dc8-06a9-4990-914e-63479c185299',
    '2e8e7c13-3c64-4686-a5fd-0b664bf8510e',
    '356dc4cf-688c-4299-b4a0-9c3d839c1490',
    '6e429241-ea4e-4273-a92e-3d4978b55047',
    '968929f0-e200-4b68-afb2-f0656d5d6bfc',
    '2d46f273-b2e1-4bab-b539-b994454f0582',
    '4602d8f0-a679-4c26-9b83-608d04abab99',
    'a08ee68f-0e5f-4cd3-ab88-b3740ddf709a',
    '8c634fb0-da0e-403c-8e4a-13cef21411a7',
    '9a195856-6f1a-4a01-8799-e922d7c39c35',
    'ef09f8bb-e8d9-4c96-aae4-e57f37ee3fde',
    '08ea5dd3-d095-4527-ae0c-43035da9c207',
    '500080ec-6911-4d78-942c-b0d4c7143894',
    'a83ffd29-e354-4731-947c-746ca6867864',
    '2a8eaabc-68e1-4962-bf1b-332f1b856a78',
    '5f42d478-c8a0-47ea-be75-abea4c1f40c6',
    '802442f2-778f-40e6-ad80-7b55fb60feb5',
    '6cdddd59-711f-4d72-8383-cfa349d58a3d',
    '94ee0523-55b7-4ed3-bd8b-d88f7df91d29',
    '1918b1ee-fa43-49a2-8e5a-d3730c0c20cc',
    '5681d110-8c84-478c-9d1f-7935a54b86ca',
    '6173529b-c677-4fa2-9580-feda9fec3f4f',
    'de5bbe81-816c-4711-b3c8-ce1f9fe1a2ab',
    '236b0da9-a28c-4a0f-a183-d6c1eda0457f',
    '87d02347-d169-4ce0-9027-3c8e11e48c40',
    '61b6ae23-ca19-4d31-bad3-2281a8528886',
    '7c4edc65-bfe6-4ede-a68a-c0b9d2564f29',
    'f330517e-46fd-4de3-8063-015b524a7324',
    'f0d8bb27-1695-4faf-8b27-4b95260b8f17',
    '17d14df1-cb64-4aae-8049-c1728a3c0c81',
    '434f85e2-4435-483c-8099-b03c8ba794ed',
    'c5be02ef-7825-42db-afff-551c88756a6a',
    '5bba97dc-d6ab-4329-912f-148c8b807056',
    '4c722626-c559-4f5a-84bd-8d7d46983e1e',
    '383c10c9-4c8a-49cb-ba4d-f5819fb1a44f',
    '8dea89a6-47f2-4d8c-9a96-dc209c413a09',
    'ddb1bf93-b9c4-41f9-a3cd-2345a78be17c',
    '7b45a50e-e6ae-4295-91c0-382a797c2870',
    'a5f6c2d2-4e9d-472c-8332-8579a2ca299b',
    'b4413e83-d0de-40b5-be47-ed56602f877c',
    'a18587b5-5f28-4820-82bd-a8ee8345e938',
    'cff9e9ae-c9fd-4a06-a461-82367d71cffd',
    '0d607d21-c9c7-4852-83e3-76825176ee0a',
    'd4b02f5f-7a62-4cad-8ffc-d3deb0fab445',
    '7639a9e0-275c-49a8-80c1-cdb01ce23e1c',
    '4c36617f-f95b-431f-96f4-5f27c43e3e2f',
    'e6deaf4c-f75a-4293-8cf8-09ca9c75e13e',
    '59602c19-a798-4417-b84f-f2409d85a964',
    '8a8626b5-f967-48f0-999d-cc7d839a83de',
    '0b64f746-53ad-479f-9db0-923bb6be7423',
    '8c8fdeef-ab29-4787-a02f-de1ec2283201',
    '541aa72b-7771-445c-8abf-6620f54f881b',
    'd7349942-f8ff-4ad6-b075-8f39652a7789',
    'b9e0de2a-4085-4226-a073-1744914cbbd4',
    '36690013-e8bc-43a5-9ba9-83317537557c',
    '172bada7-f1c5-41c4-836d-05381beaed9a',
    '9a1e873b-b1db-4d3e-a83b-ed6c5b3f3ecc',
    '2c6de04e-104d-42c8-8448-97d74985dacb',
    '452bcafd-ab45-4e24-b5e0-13fcf22b0755',
    'fbafdd31-21a0-44c5-ae4d-724839beff61',
    '2a1882d9-88ca-4849-bcc1-f6914f593407',
    '3838993f-59ba-4dec-8110-ac3ea387ab91',
    'bf2f4106-cee9-419c-b4d1-d7b03a6293d5',
    'a6c36f5e-b86c-4164-85ae-8bf0df2e4a90',
    '11a7572f-02b9-4f88-8c2c-802dfb1f94b7',
    '5e547934-c339-410e-a013-dfefed50f4b8',
    'ffa84feb-ca0e-43d3-a04d-a402a8e24a3b',
    '2be072bd-2153-4050-9358-e4b95297a9bf',
    '5d250589-9ce4-4743-9a9f-4201d07d66b9',
    '7c19d852-e36a-4353-afea-10e501601d9a',
    'fd3843fe-ee5d-4784-b0d2-6673f9886d30',
    '84703c54-a9dd-400c-9701-2fc40922e3e3',
    '00297802-e20a-413f-b389-a6f764b6600e',
    'c853d4c0-d4be-433d-964e-e30bdc35480e',
    '3e85b06a-a6ea-4ce8-a655-44b1fce12138',
    'c3c1a841-53e5-404e-9f22-0438c7d1c215',
    '9efae3c7-904c-48a8-939a-e82b46005ae1',
    '6e674477-522f-4adc-8c50-76910a6a282b',
    '504089f1-c59d-48fe-84ef-858bd3eb3043',
    '1cf943bc-9ffe-4fd0-a92d-6fdcf68da743',
    'bb11d621-e471-4ca9-b9ae-cf06c99db297',
    '7b875b4b-a6c5-4c92-a252-cd5ff203089e',
    '97b3d565-3c32-4fd5-be49-c16f0bae84e7',
    '5b8c745a-972b-455c-8021-ee24fdbce9a5',
    'bebf0200-8458-4467-b001-ff436564e942',
    '1c16f983-c090-457a-aca7-4181d16e225b',
    'b259ac6c-3358-4faa-abfe-c9d614b76915',
    '1a119cfe-3178-4f06-800b-b2aec50218b8',
    '42fd7b4a-461d-4a4f-bb02-856e7124dce1',
    '189a0802-8538-41f8-ad51-8bb2a736783b',
    '606fd5a6-6d07-4701-b8f4-1dfb3c7d32c7',
    'e0dc36c3-ff48-4ab5-881f-899578e08dd4',
    '9052b5fc-8ac8-41ea-8a82-6860b8d2c33d',
    'b8bc131f-68d6-4c56-bd37-55c1b0e27d2e',
]
# Call delete_snapshot once for every ID in snapshot_id_list, in list order.
# delete_snapshot is not defined in this cell; it must already exist in the kernel.
for snapshot_id in snapshot_id_list:
    delete_snapshot(snapshot_id)

# # Delete datasets and all their associated snapshots
# dataset_id_list = [
# 'a43075e3-9abd-4518-bc17-ff162d60cbde',
# '7d546f72-1688-417b-8af2-2f9c98034cd4',
# 'ed82e510-37aa-47f6-88f0-b2ba33e0fdb0',
# '77d3754a-6e43-432f-afa8-c8a24c77faab',
# 'dea7d0d6-e27a-4447-b06f-1136c6bab6e3',
# ]
# for dataset_id in dataset_id_list:
#     delete_dataset_and_all_snapshots(dataset_id)

## Clean Up Outdated AnVIL TDR Service Accounts

In [None]:
# Service accounts that must be kept. Any group or workspace member whose email
# contains "tdr-ingest-sa" and is NOT listed here is removed by the loops below.
# NOTE(review): the list is empty as written, so every matching service account
# is treated as outdated -- populate it before running if any should be kept.
valid_sa_list = [
]

# Establish credentials
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
# Build the bearer header once; every request in this cell uses the same token
auth_headers = {"Authorization": f"Bearer {creds.token}"}

# Get current anvil_tdr_ingest membership
group = "anvil_tdr_ingest"
group_response = requests.get(
    url=f"https://api.firecloud.org/api/groups/{group}",
    headers=auth_headers
)
# Surface the HTTP error itself instead of a KeyError on the payload lookup below
group_response.raise_for_status()
group_members = group_response.json()

# Loop through anvil_tdr_ingest membership and remove outdated users
user_cnt = 0
success_cnt = 0
group_failures = []  # (email, HTTP status) for each removal that was not confirmed
for member in group_members["membersEmails"]:
    if "tdr-ingest-sa" in member and member not in valid_sa_list:
        user_cnt += 1
        response = requests.delete(
            url=f"https://api.firecloud.org/api/groups/{group}/member/{member}",
            headers=auth_headers
        )
        # Only a 204 response is counted as a successful removal
        if response.status_code == 204:
            success_cnt += 1
        else:
            group_failures.append((member, response.status_code))
print(f"Group ({group}) clean-up: ")
print(f"\t- Users to remove: {user_cnt}")
print(f"\t- Users removed successfully: {success_cnt}")
# Name the accounts that could not be removed so they can be retried or investigated
for failed_member, failed_status in group_failures:
    print(f"\t- Failed to remove {failed_member} (HTTP {failed_status})")

# Get current workspace membership
ws_response = requests.get(
    url=f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}/acl",
    headers=auth_headers
)
# Surface the HTTP error itself instead of a KeyError on the payload lookup below
ws_response.raise_for_status()
ws_members = ws_response.json()

# Loop through workspace membership and remove outdated users
user_cnt = 0
success_cnt = 0
ws_failures = []  # (email, HTTP status) for each ACL update that was not confirmed
for member in ws_members["acl"].keys():
    if "tdr-ingest-sa" in member and member not in valid_sa_list:
        user_cnt += 1
        # Removal is expressed as an ACL update that drops the account to NO ACCESS
        payload = [{
            "email": member,
            "accessLevel": "NO ACCESS",
            "canShare": False,
            "canCompute": False
        }]
        response = requests.patch(
            url=f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}/acl",
            headers=auth_headers,
            json=payload
        )
        # Only a 200 response is counted as a successful removal
        if response.status_code == 200:
            success_cnt += 1
        else:
            ws_failures.append((member, response.status_code))
print(f"Workspace ({ws_project}/{ws_name}) clean-up: ")
print(f"\t- Users to remove: {user_cnt}")
print(f"\t- Users removed successfully: {success_cnt}")
# Name the accounts that could not be removed so they can be retried or investigated
for failed_member, failed_status in ws_failures:
    print(f"\t- Failed to remove {failed_member} (HTTP {failed_status})")



## Other Misc

In [None]:
# List the top-level contents of the bucket via the gsutil CLI; the -u option names
# the project (anvil-datastorage) to bill for the request.
!gsutil -u anvil-datastorage ls gs://fc-secure-33cad843-3453-42ea-bf50-0eda2b52171d