In [None]:
# Version History
# Earlier entries are kept as commented-out print calls; only the final, uncommented entry is printed
# when this cell runs.
#print("Version 1.0.0: 09/23/2022 5:06pm - Nate Calvanese - First version created")
#print("Version 1.0.1: 09/26/2022 11:18m - Nate Calvanese - Fixed bug in default dataset naming")
#print("Version 1.0.2: 09/27/2022 2:43pm - Nate Calvanese - Added ability to aggregate multiple workspaces into one dataset")
#print("Version 1.0.3: 10/5/2022 1:32pm - Nate Calvanese - Added support for chunking up ingest requests")
#print("Version 1.0.4: 10/6/2022 10:35am - Nate Calvanese - Updated use of TDR utility functions")
#print("Version 1.0.5: 10/13/2022 10:54am - Nate Calvanese - Parameter tweaks for latest changes")
#print("Version 1.0.6: 10/21/2022 10:53am - Nate Calvanese - Version stamp for latest changes to supporting notebooks")
#print("Version 1.0.7: 10/24/2022 4:58pm - Nate Calvanese - Added support for project entity name derivation")
#print("Version 1.0.8: 10/26/2022 4:24pm - Nate Calvanese - Added support for batching mapping activities in section 3")
#print('Version 1.0.9: 2/21/2023 2:50pm - Nate Calvanese - Added support for $BQ_DATASET substitution variable in mapping section')
#print('Version 1.0.10: 3/8/2023 8:17am - Nate Calvanese - Performance improvements')
#print('Version 1.0.11: 7/11/2023 8:17am - Nate Calvanese - Added auth domain back as reader on snapshots')
#print('Version 1.0.12: 9/1/2023 10:16am - Nate Calvanese - Added functionality to enable/disable secure monitoring for public datasets.')
#print('Version 1.0.13: 12/15/2023 9:00am - Nate Calvanese - Added functionality to optionally truncate tables before ingest')
#print('Version 1.0.14: 1/12/2024 11:28am - Nate Calvanese - Added max_combined_rec_ref_size as a global parameter')
#print('Version 1.0.15: 2/5/2025 3:10pm - Nate Calvanese - Updated input parameters for T pipeline')
#print('Version 1.0.16: 2/20/2025 11:46am - Nate Calvanese - Updated input parameters for mapping section')
#print('Version 1.0.17: 3/4/2025 10:10am - Nate Calvanese - Updated input parameters for transformation section')
#print('Version 1.1.0 5/7/2025 3:14pm - Nate Calvanese - Refactored notebook to support running in dev as well as prod.')
#print('Version 1.2.0 7/30/2025 1:49pm - Nate Calvanese - Updated section 3.2 to limit certain relationships to a target mapping')
# NOTE(review): the active entry below reuses version number 1.2.0 from the entry above even though it
# describes a later change -- confirm whether the number should have been bumped.
print('Version 1.2.0 7/30/2025 9:16pm - Nate Calvanese - Added functionality for comparing row counts between objects')


# Imports and Common Variables

In [None]:
# Install additional modules (one time effort per cloud environment)
!pip install --upgrade pip import_ipynb data_repo_client urllib3 xmltodict azure-storage-blob
# !pip install data_repo_client==1.409.0

In [None]:
# # Copy latest version of the pipeline notebooks to the cloud environment (uncomment if any notebooks have changed since last run)
# print("\nCopying latest pipeline notebooks to the cloud environment:")
# !gsutil -m cp $ws_bucket/notebooks/*.ipynb .

# # Manually copy in single notebook
# import os
# ws_bucket = os.environ["WORKSPACE_BUCKET"]
# !gsutil cp $ws_bucket/notebooks/ingest_pipeline_utilities.ipynb .
# # !gsutil cp $ws_bucket/notebooks/build_mapping_query.ipynb .

In [None]:
# Workspace environment variables
import os
import re
print("Recording workspace environment variables:")
# All three variables are required; a missing one raises KeyError here
ws_name, ws_project, ws_bucket = (
    os.environ[_env_key]
    for _env_key in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# Bucket name is the bucket URL without its leading gs:// scheme
ws_bucket_name = re.sub('^gs://', '', ws_bucket)
for _label, _value in (
    ("Workspace name", ws_name),
    ("Workspace project", ws_project),
    ("Workspace bucket", ws_bucket),
    ("Workspace bucket name", ws_bucket_name),
):
    print(f"{_label} = {_value}")

# Copy latest version of the pipeline notebooks to the cloud environment (uncomment if any notebooks have changed since last run)
# print("\nCopying latest pipeline notebooks to the cloud environment:")
# !gsutil -m cp $ws_bucket/notebooks/*.ipynb .

# Additional imports
print("\nRunning imports:")
import import_ipynb
import pandas as pd
import numpy as np
from firecloud import api as fapi
import data_repo_client
import ingest_pipeline_utilities as utils
import build_mapping_query as bmq
from google.cloud import storage
from google.cloud import bigquery
import google.auth
import google.auth.transport.requests
import logging
import datetime
import json
import sys
from time import sleep
import requests
from io import BytesIO
import pyarrow.parquet as pq
from azure.storage.blob import BlobClient, ContainerClient

# Common pipeline variables (AnVIL)
# Environment-specific settings, keyed by the supported values of params["run_env"]
run_env_settings = {
    "prod": {
        "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
        "terra_url": "https://api.firecloud.org",
        "tdr_url": "https://data.terra.bio",
    },
    "dev": {
        "profile_id": "ab050a35-e597-4c81-9d24-331a49e86016",
        "terra_url": "https://firecloud-orchestration.dsde-dev.broadinstitute.org",
        "tdr_url": "https://jade.datarepo-dev.broadinstitute.org",
    },
}
params = {}
params["run_env"] = "prod" # 'prod' or 'dev'
# Fail fast on an unrecognised environment instead of silently falling back to the dev endpoints
if params["run_env"] not in run_env_settings:
    raise ValueError(
        "Unsupported run_env '{}'. Expected one of: {}".format(params["run_env"], ", ".join(run_env_settings))
    )
params.update(run_env_settings[params["run_env"]])

# Workspace details for the workspace this notebook is running in
ws_attributes = utils.get_workspace_attributes(ws_project, ws_name, params["terra_url"])
params["ws_name"] = ws_name
params["ws_project"] = ws_project
params["ws_bucket"] = ws_bucket
params["ws_bucket_name"] = ws_bucket_name
params["google_project"] = ws_attributes["googleProject"]

# File inventory and ingest settings shared by every pipeline run
params["create_file_table"] = True
params["file_table_name"] = "file_inventory"
#params["ingest_user_to_add"] = "tdr_sa"  # tdr_sa or anvil_tdr_ingest
params["global_file_exclusions"] = ["SubsetHailJointCall", ".vds/", "ingest_ignore"]
params["max_combined_rec_ref_size"] = 40000

# Configure logging format
# Detach every handler currently on the root logger, then install a single stdout handler
for _existing_handler in list(logging.root.handlers):
    logging.root.removeHandler(_existing_handler)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Configure pandas display options (None removes the corresponding limit)
_pandas_display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    "display.max_colwidth": None,
    'display.width': 1000,
    'display.colheader_justify': 'center',
    'display.precision': 3,
}
for _option_name, _option_value in _pandas_display_options.items():
    pd.set_option(_option_name, _option_value)


# "EL" Pipeline: Load Dataset to TDR in Source Format

## Pipeline Run Variables

In [None]:
## >>> Run Variables <<<
# When a dataset is split across multiple workspaces, give each of those workspaces the same staging
# area and the same target TDR dataset so all of the source data is collected and processed together.
workspace_run_list = [
    # Entry layout:
    #   [0] "Workspace_Name"
    #   [1] "Workspace_Project"
    #   [2] Public (True/False)
    #   [3] "Staging Area (Leave empty for default)"
    #   [4] "Target_TDR_Dataset_Name (Leave empty for default)"
    #   [5] Run (True/False)
    ['AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSA-MD', 'anvil-datastorage', False, '', 'ANVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSA_MD_20250730', True],
#     ['AnVIL_CCDG_WashU_CVD_EOCAD_METSIM_WGS', 'anvil-datastorage', False, '', 'ANVIL_CCDG_WashU_CVD_EOCAD_METSIM_WGS_20250730', True],
]
params.update({
    "skip_source_files_creation": True,
    "skip_file_inventory_creation": True,
    "skip_table_data_processing": True,
    "skip_ingests": False,
    "trunc_before_ingest": True,
    "skip_snapshot_creation": True,
    # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
    "snapshot_readers_list": ["auth-domain"],
})


## >>> File Inventory Variables <<<
# The GCS bucket associated with the source workspace is included in the file inventory build automatically.
# List any additional GCS buckets that should be part of the file inventory build in the dictionary below.
params.update({"additional_file_inventory_sources": {}})
# EXAMPLE:
# params["additional_file_inventory_sources"] = {
#     "staging_area": {
#         "bucket_name": {
#             "include_dirs": [], # Leave empty to include all directories in bucket
#             "exclude_dirs": [] # Exclusions will take precedence over inclusions
#         }
#     }
# }


## >>> Ingest Variables <<<
# To ingest only a subset of files, spell out exactly what should be ingested in the dictionary below.
params.update({
    "ingest_list_override": {
        "sample": [],
    },
})
# EXAMPLE:
# params["ingest_list_override"] = {
#     "ws_table": ["ws_table_0.json"], # Leave empty to run ingest for every file for target table
# }


## >>> File Reference Variables <<<
# The pipeline identifies fields containing GCS links automatically. Use the dict below for fields in the
# workspace tables that hold file references which are not proper GCS links.
data_file_refs_dict = dict()
# Definitions:
#    Required Fields: column, method, mode, create_new field
#    Optional Fields: match_multiple_files (default to True), match_regex (default to None), match_type (default to 'partial'), new_field_name (default to None)
#    Methods:
#       file_path_match -- Field contains a full or partial file path, which can be matched to the file inventory to grab the file(s) referenced
#       tdr_file_id -- Field contains file UUIDs of files already ingested into the target TDR dataset
#    Modes:
#       fileref_in_line -- Populates the field with a file reference object
#       fileref_table_ref -- Populates the field with an ID that joins to a file table. If no file table built, falls back on fileref_in_line logic.
    
#-----------------------------------------------------------------------------------------------------------#
    
# Print variables
print("Pipeline run variables set:")
print("Profile ID: " + params["profile_id"])
print("Ingests to run: ")
current_datetime = datetime.datetime.now()
current_date_string = current_datetime.strftime("%Y%m%d")
for workspace in workspace_run_list:
    # Only workspaces flagged to run are looked up and reported
    if workspace[5] != True:
        continue
    ws_attributes = utils.get_workspace_attributes(workspace[1], workspace[0], params["terra_url"])
    ws_attribute_values = ws_attributes["attributes"]

    # Workspace metadata; each value falls back to an empty value when absent or empty
    params["phs_id"] = utils.format_phs_id(ws_attribute_values["phs_id"]) if ws_attribute_values.get("phs_id") else ""
    auth_list = ws_attributes.get("authorizationDomain") or []
    params["auth_domains"] = [auth_entry["membersGroupName"] for auth_entry in auth_list]
    params["consent_name"] = ws_attribute_values.get("library:dataUseRestriction") or ""
    if not params["consent_name"]:
        # No data use restriction attribute: fall back to the first "consent_code:" workspace tag
        ws_tags = ws_attribute_values.get("tag:tags")
        first_consent_tag = next((tag for tag in (ws_tags or []) if "consent_code:" in tag), None)
        if first_consent_tag is not None:
            params["consent_name"] = first_consent_tag.replace("consent_code:", "").strip()
    params["data_files_src_bucket"] = ws_attributes.get("bucketName") or ""
    params["public_dataset"] = workspace[2]

    # Fill in the default target dataset name and staging area where none was given
    if not workspace[4]:
        workspace[4] = utils.format_dataset_name(workspace[0])
    if not workspace[3]:
        workspace[3] = workspace[0]

    print("- Workspace [" + workspace[1] + "/" + workspace[0] + "] to TDR dataset [" + workspace[4] + "] via Staging Area [" + workspace[3] + "]")
    print("\t- PHS ID = " + params["phs_id"])
    print("\t- Consent Short Name = " + params["consent_name"])
    print("\t- Auth Domains = " + str(params["auth_domains"]))
    print("\t- Public Dataset = " + str(params["public_dataset"]))
    print("\t- Data Files Source Bucket = " + params["data_files_src_bucket"])

# Echo the run switches in a fixed order
for _switch_label, _switch_key in (
    ("Skip source files creation? ", "skip_source_files_creation"),
    ("Skip file inventory creation? ", "skip_file_inventory_creation"),
    ("Skip table data processing? ", "skip_table_data_processing"),
    ("Skip ingests? ", "skip_ingests"),
    ("Truncate tables before ingest? ", "trunc_before_ingest"),
    ("Ingest override list: ", "ingest_list_override"),
    ("Skip snapshot creation? ", "skip_snapshot_creation"),
):
    print(_switch_label + str(params[_switch_key]))


## Pipeline Execution

In [None]:
# Loop through and execute workspace connector pipeline ("E") for listed workspaces
# Both the "E" and "L" stages write their results files into this directory, so create it once here.
# os.makedirs replaces the shell mkdir calls so that a failure to create the directory raises
# instead of passing silently.
os.makedirs("pipeline_results", exist_ok=True)
if params["skip_source_files_creation"] == True:
    logging.info("Skipping source file creation, per user request.")
else:
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M")
    e_output_file = "pipeline_results/e_pipeline_results_" + current_datetime_string + ".json"
    logging.info(f"E Pipeline Results File: {e_output_file}")
    params["e_pipeline_results"] = []
    for workspace in workspace_run_list:
        if workspace[5] == True:
            params["data_file_refs"] = data_file_refs_dict
            # NOTE(review): params["pipeline_results"] is presumably populated by the utility call below -- confirm
            utils.run_ws_connector_pipeline(workspace, params)
            params["e_pipeline_results"].extend(params["pipeline_results"])
            # The results file is rewritten after every workspace with the results accumulated so far
            if params["e_pipeline_results"]:
                with open(e_output_file, "w") as e_out:
                    json.dump(params["e_pipeline_results"], e_out)

# Aggregate staging area to target dataset combinations, loop through them, and execute ingest pipeline ("L")
# Each distinct [staging area, target dataset, public flag] combination is run exactly once
pipeline_run_list = []
for workspace in workspace_run_list:
    if workspace[5] == True:
        temp_list = [workspace[3], workspace[4], workspace[2]]
        if temp_list not in pipeline_run_list:
            pipeline_run_list.append(temp_list)
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M")
l_output_file = "pipeline_results/l_pipeline_results_" + current_datetime_string + ".json"
logging.info(f"L Pipeline Results File: {l_output_file}")
params["l_pipeline_results"] = []
for pipeline in pipeline_run_list:
    utils.run_el_pipeline(pipeline, params)
    params["l_pipeline_results"].extend(params["pipeline_results"])
    # The results file is rewritten after every combination with the results accumulated so far
    if params["l_pipeline_results"]:
        with open(l_output_file, "w") as l_out:
            json.dump(params["l_pipeline_results"], l_out)


# Mapping Development
Work through the following steps for each dataset that needs to be processed through the transformation pipeline in Step 4, specifying the target schema ("mapping_target") and mapping specification ("mapping_target_spec") you would like to use for transformation. You can also specify the PHS ID ("phs") and consent group ("consent") for the dataset in order to pass that information into the transformations, to cover cases where it is not provided by the data submitters in their data tables. Note that you can use the logs or results_dict from the previous step to retrieve the dataset_id values of interest, or retrieve them directly from TDR via the UI or Swagger.

## Dataset Mapping Variables

In [None]:
## >>> Mapping Variables <<<
# For each dataset specified, include an appropriate mapping target and mapping target specification
datasets_to_map_list = [
    #["dataset_id", "mapping_target", "mapping_target_spec", "phs", "consent", Run (True/False)]
#     ['1e84fc06-90e2-4b76-bae1-81b92822e761', 'anvil', 'gafk_1', 'phs002206', 'DS-PEDD-IRB', True],
#     ['110c5620-4e55-4738-a490-b45098e52bb0', 'anvil', 'igvf_1', 'phs003472', 'HMB-MDS', True],
    ['7f8e26ba-b3ff-4751-b17d-44560b22922f', 'anvil', 'gregor_1', 'phs003047', 'GRU', True],
    ['6b2574fe-523c-4d71-ab66-d336a79cc9e0', 'anvil', 'anvil_1', 'phs003018', 'NRES', True],
]

#-----------------------------------------------------------------------------------------------------------#

def _summarize_error(error, max_length=300):
    """Collapse an exception message onto one line and cap its length for the skip summary."""
    return " ".join(str(error).split())[:max_length]

# Print variables
print("Datasets to map: ")
api_client = utils.refresh_tdr_api_client(params["tdr_url"])
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
final_datasets_to_map_dict = {}
skip_dataset_list_access = []
skip_dataset_list_mapping = []
# Reason each dataset was skipped, keyed by dataset_id, so the summary can explain the failure
skip_dataset_access_errors = {}
skip_dataset_mapping_errors = {}
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
for dataset in datasets_to_map_list:
    if dataset[5]:
        dataset_id, mapping_target, mapping_target_spec, mapping_phs, mapping_consent = dataset[:5]

        # Check 1: the dataset can be retrieved from TDR by the current user
        try:
            dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            dataset_name = dataset_info["name"]
        except Exception as e:
            dataset_name = ""
            skip_dataset_list_access.append(dataset_id)
            skip_dataset_access_errors[dataset_id] = _summarize_error(e)

        # Check 2: the target schema and mapping specification both exist in the workspace bucket
        # and parse as JSON (the parsed content itself is not needed here)
        try:
            for mapping_artifact_path in (
                "ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target),
                "ingest_pipeline/mapping/{}/{}/mapping_specification.json".format(mapping_target, mapping_target_spec),
            ):
                blob = bucket.blob(mapping_artifact_path)
                json.loads(blob.download_as_string(client=None))
        except Exception as e:
            skip_dataset_list_mapping.append(dataset_id)
            skip_dataset_mapping_errors[dataset_id] = _summarize_error(e)

        # Only datasets that passed both checks are carried forward
        if dataset_id not in skip_dataset_list_access and dataset_id not in skip_dataset_list_mapping:
            final_datasets_to_map_dict[dataset_id] = {
                "mapping_target": mapping_target,
                "mapping_target_spec": mapping_target_spec,
                "mapping_phs": mapping_phs,
                "mapping_consent": mapping_consent,
            }
            print(f"\t - {dataset_name} ({dataset_id}) with {mapping_target}/{mapping_target_spec}")
if skip_dataset_list_access:
    print("Datasets to skip due to non-existence or inaccessibility to the current user:")
    for skipped_dataset_id in skip_dataset_list_access:
        print("\t- {} (error: {})".format(skipped_dataset_id, skip_dataset_access_errors.get(skipped_dataset_id, "")))
if skip_dataset_list_mapping:
    print("Datasets to skip due to invalid mapping target or mapping target specification:")
    for skipped_dataset_id in skip_dataset_list_mapping:
        print("\t- {} (error: {})".format(skipped_dataset_id, skip_dataset_mapping_errors.get(skipped_dataset_id, "")))


## Add Missing Relationships to TDR Dataset Schema
Relationships are needed by the mapping query constructor to build appropriate joins between tables. If no joins are required between tables, this step is unnecessary. 

In [None]:
# Display the datasets selected for mapping (keyed by dataset_id) for review before relationships are added
final_datasets_to_map_dict

In [None]:
# Record relationships to potentially add to the source datasets. The relationships below are not
# necessarily exhaustive, so extend them as necessary.
def _gregor_experiment_relationships(experiment_type):
    """Return the four [from, to] relationship pairs recorded for one GREGoR experiment type."""
    experiment = "experiment_" + experiment_type
    aligned = "aligned_" + experiment_type
    aligned_set = aligned + "_set"
    called_variants = "called_variants_" + experiment_type
    return [
        [experiment + ".analyte_id", "analyte.analyte_id"],
        [aligned + "." + experiment + "_id", experiment + "." + experiment + "_id"],
        [aligned_set + "." + aligned + "s", aligned + "." + aligned + "_id"],
        [called_variants + "." + aligned_set + "_id", aligned_set + "." + aligned_set + "_id"],
    ]

# GREGoR experiment types, in the order their relationships are listed
_gregor_experiment_types = [
    "rna_short_read",   # GREGoR RNA Short Read
    "dna_short_read",   # GREGoR DNA Short Read
    "nanopore",         # GREGoR Nanopore
    "pac_bio",          # GREGoR PacBio
    "atac_short_read",  # GREGoR ATAC Short Read
]

potential_relationships = {
    # Each entry is a ["from_table.from_column", "to_table.to_column"] pair
    "ALL_DATASETS": [
        ["subject.family_id", "family.family_id"],
        ["sample.subject_id", "subject.subject_id"],
        ["sample.t_01_subject_id", "subject.subject_id"],
        ["sequencing.sample_id", "sample.sample_id"],
        ["sequencing.sample", "sample.sample_id"],
        ["sequencing.sample_alias", "sample.sample_id"],
        ["sample.participant", "participant.participant_id"],
        ["sample.participant_id", "participant.participant_id"],
        ["discovery.sample_id", "sample.sample_id"],
        ["discovery.subject_id", "subject.subject_id"],
        ["qc_result_sample.qc_result_sample_id", "sample.sample_id"],
        ["interval.chromosome", "chromosome.chromosome_id"],
        ["analyte.participant_id", "participant.participant_id"],
        ["participant.family_id", "family.family_id"],
        ["phenotype.participant_id", "participant.participant_id"],
        ["biosample.donor_id", "donor.donor_id"],
    ],
    "gregor": [
        relationship
        for experiment_type in _gregor_experiment_types
        for relationship in _gregor_experiment_relationships(experiment_type)
    ],
}

# Loop through datasets and process potential relationship additions
# Exactly one row is recorded in results for each dataset.
results = []
api_client = utils.refresh_tdr_api_client(params["tdr_url"])
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
for dataset_id, dataset_details in final_datasets_to_map_dict.items():
    print("Processing potential relationships for dataset_id = {}".format(dataset_id))
    mapping_target_spec = dataset_details["mapping_target_spec"]

    # Retrieve source schema
    src_schema_dict = {}
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))
        results.append([dataset_id, "Error"])
        # Nothing can be evaluated without the source schema; move on so the dataset is not
        # also recorded as a success further down
        continue

    # Index the relationships already on the dataset. Each one is stored as an unordered pair of
    # (table, column) endpoints so that a relationship defined in either direction counts as present.
    known_endpoint_pairs = set()
    for rel_entry in src_schema_dict["relationships"] or []:
        known_endpoint_pairs.add(frozenset({
            (rel_entry["_from"]["table"], rel_entry["_from"]["column"]),
            (rel_entry["to"]["table"], rel_entry["to"]["column"]),
        }))

    # Loop through potential relationships and add those present for the source dataset
    additional_relationships = []
    for rel_set, relationships in potential_relationships.items():
        # A relationship set applies to every dataset, or only when its name appears in the mapping target spec
        if rel_set != "ALL_DATASETS" and rel_set not in mapping_target_spec:
            continue
        for rel in relationships:
            from_table, from_column = rel[0].split(".")[:2]
            to_table, to_column = rel[1].split(".")[:2]
            # Both ends of the relationship must exist in the source schema
            if not (bmq.confirm_column_exists(src_schema_dict, from_table, from_column) and bmq.confirm_column_exists(src_schema_dict, to_table, to_column)):
                continue
            endpoint_pair = frozenset({(from_table, from_column), (to_table, to_column)})
            if endpoint_pair in known_endpoint_pairs:
                continue
            # Remember the pair so the same relationship is never requested twice in one update
            known_endpoint_pairs.add(endpoint_pair)
            additional_relationships.append({
                "name": from_table + "_" + from_column + "__to__" + to_table + "_" + to_column,
                "from": {"table": from_table, "column": from_column},
                "to": {"table": to_table, "column": to_column}
            })

    # Submit the schema update request for the TDR dataset
    if additional_relationships:
        schema_update_request = {
            "description": "Adding relationships to support query construction.",
            "changes": {
                "addRelationships": additional_relationships
            }
        }
        try:
            resp = utils.wait_for_tdr_job(datasets_api.update_schema(id=dataset_id, dataset_schema_update_model=schema_update_request), params["run_env"])
            print("Schema update successful: " + str(resp)[0:1000])
            results.append([dataset_id, "Success"])
        except Exception as e:
            print("Error running schema update: " + str(e))
            results.append([dataset_id, "Error"])
    else:
        print("No additional relationships to add to schema.")
        results.append([dataset_id, "Success"])

print("Processing of potential relationships for specified datasets complete.")
print("\nResults:")
results_df = pd.DataFrame(results, columns = ["dataset", "status"])
display(results_df)


## Retrieve Mapping Artifacts and Run Query Construction
Retrieve the artifacts you would like to use to construct transformation queries for your datasets, based on the previously specified target schema and mapping specification. These transformation queries will then be dynamically constructed based on the appropriate target schema, mapping specification, and source schema. 

In [None]:
# Loop through datasets and process transformation query construction
api_client = utils.refresh_tdr_api_client(params["tdr_url"])
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
results = []
for dataset_id in final_datasets_to_map_dict:
    print("Building transformation queries for dataset_id = {}".format(dataset_id))

    # Collect mapping variables
    mapping_target = final_datasets_to_map_dict[dataset_id]["mapping_target"]
    mapping_target_spec = final_datasets_to_map_dict[dataset_id]["mapping_target_spec"]
    mapping_phs = final_datasets_to_map_dict[dataset_id]["mapping_phs"]
    mapping_consent = final_datasets_to_map_dict[dataset_id]["mapping_consent"]
    
    # Retrieve source schema
    src_schema_dict = {}
    try:
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["name"] = response["name"]
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
        phs_id = response["phs_id"]
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))

    # Set dataset name and project name parameters to substitute into transform queries
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = re.sub("'", "", utils.derive_project_name(dataset_id, mapping_phs, dataset_name_value, params))
    phs_id_value = mapping_phs
    consent_group_value = mapping_consent

    # Retrieve target schema and mapping specification
    target_schema_dict = {}
    mapping_spec = {}
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(ws_bucket_name)
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
        target_schema_dict = json.loads(blob.download_as_string(client=None))
    except Exception as e:
        print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
        blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
        blob_string = blob_string.replace("$PROJECT_PHS", phs_id_value)
        blob_string = blob_string.replace("$DATASET_CONSENT", consent_group_value)
        blob_string = blob_string.replace("$BQ_DATASET", bq_project + "." + bq_schema)
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

    # Update aliases in mapping specification
    mapping_spec = bmq.update_mapping_spec_aliases(mapping_spec, src_schema_dict)
    
    # Build queries from mapping specification
    query_dict = {}
    if target_schema_dict:
        for target_table in target_schema_dict["tables"]:
            table_name = target_table["name"]
            missing_artifacts = False
            if src_schema_dict and mapping_spec:
                query_dict[table_name] = bmq.build_mapping_query(target_table, src_schema_dict, mapping_spec, bq_project, bq_schema)
            else:
                missing_artifacts = True
                query_dict[table_name] = {"query": "", "syntax_check": ""} 
        if missing_artifacts == True:
            print("Source schema dictionary and/or mapping specification missing. Unable to generate queries.")
            results.append([dataset_id, "Error"])
    else:
        print("Target schema dictionary missing. Unable to generate queries.")
        results.append([dataset_id, "Error"])
    
    # Evaluate queries -- Publish if no issues found, otherwise convert to dataframe and display
    failure_count = 0
    for key, val in query_dict.items():
        if val["syntax_check"] != "Passed" and val["syntax_check"] != None:
            failure_count += 1
    if failure_count == 0:
        print("No failures found in query construction, publishing to the cloud.")
        results.append([dataset_id, "Success"])
        # Copy target schema file to output folder for mapping target
        source_path = "ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target)
        destination_path = "ingest_pipeline/output/transformed/{}/{}/schema/mapping_schema_object.json".format(mapping_target, dataset_id)
        !gsutil cp $ws_bucket/$source_path $ws_bucket/$destination_path 2> stdout

        # Limit query dict to valid queries, write out, and copy to output folder for mapping target
        valid_query_dict = {}
        for target, val in query_dict.items():
            if val["syntax_check"] == "Passed":
                valid_query_dict[target] = val
        final_query_dict = {
            "dataset_id": dataset_id,
            "transforms": valid_query_dict
        }
        query_dict_json = json.dumps(final_query_dict)
        query_output_file = "transform_query_set.json"
        with open(query_output_file, 'w') as outfile:
            outfile.write(query_dict_json)
        destination_path = "ingest_pipeline/output/transformed/{}/{}/queries".format(mapping_target, dataset_id)
        !gsutil cp $query_output_file $ws_bucket/$destination_path/ 2> stdout
    else:
        print("Failures found in query construction, must be resolved before publishing.")
        print("Query building results:")
        results.append([dataset_id, "Error"])
        query_df = pd.DataFrame.from_dict(query_dict, orient="index")
        query_df.index.name = "target_table"
        query_df.reset_index(inplace=True)
        display(query_df)

print("Transformation query construction and processing complete.")
print("\nResults:")
results_df = pd.DataFrame(results, columns = ["dataset", "status"])
display(results_df)


## Evaluate Vocabulary Mapping
For target attributes leveraging the "VOCAB_MAP" transformation, evaluate whether the source values have a record in the dsp-data-ingest.transform_resources.vocab_map table. If additional mappings are needed, these should be put into place before the transformation queries are executed.

In [None]:
# Set display parameter
show_only_missing_maps = True

# Build the TDR client and the workspace bucket handle once; both are reused for every dataset below
api_client = utils.refresh_tdr_api_client(params["tdr_url"])
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)

# Loop through datasets and process vocabulary mapping evaluation
for dataset_id in final_datasets_to_map_dict:
    print("Evaluating vocabulary mapping for dataset_id = {}".format(dataset_id))

    # Collect mapping variables
    mapping_target = final_datasets_to_map_dict[dataset_id]["mapping_target"]
    mapping_target_spec = final_datasets_to_map_dict[dataset_id]["mapping_target_spec"]
    mapping_phs = final_datasets_to_map_dict[dataset_id]["mapping_phs"]
    mapping_consent = final_datasets_to_map_dict[dataset_id]["mapping_consent"]

    # Retrieve source schema
    src_schema_dict = {}
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["name"] = response["name"]
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
        phs_id = response["phs_id"]
    except Exception as e:
        # Everything below needs the source schema and BigQuery location. Falling through
        # used to end in a KeyError on src_schema_dict["name"] that aborted the whole loop,
        # so report the problem and move on to the next dataset instead.
        print("Error retrieving source schema from TDR. Error: {}".format(e))
        print("Skipping vocabulary mapping evaluation for dataset_id = {}".format(dataset_id))
        continue

    # Set dataset name and project name parameters to substitute into transform queries
    # (the regex strips a trailing "_<digits>" suffix from the TDR dataset name)
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = utils.derive_project_name(dataset_id, phs_id, dataset_name_value, params)
    phs_id_value = mapping_phs
    consent_group_value = mapping_consent

    # Retrieve target schema and mapping specification
    target_schema_dict = {}
    mapping_spec = {}
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
        target_schema_dict = json.loads(blob.download_as_string(client=None))
    except Exception as e:
        print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        # Resolve the substitution variables in the raw specification text before parsing it
        blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
        blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
        blob_string = blob_string.replace("$PROJECT_PHS", phs_id_value)
        blob_string = blob_string.replace("$DATASET_CONSENT", consent_group_value)
        blob_string = blob_string.replace("$BQ_DATASET", bq_project + "." + bq_schema)
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

    # Evaluate vocab mapping and display results
    df = bmq.evaluate_vocab_mapping(mapping_spec, src_schema_dict, target_schema_dict, bq_project, bq_schema)
    print("-------------------------------------------")
    print("Missing mapped_value view:")
    print("-------------------------------------------")
    # Rows that have a source value but no mapped value are the ones still needing a vocab_map entry
    display(df[df["mapped_value"].isnull() & df["source_value"].notnull()])
    if not show_only_missing_maps:
        print("\n-------------------------------------------")
        print("Full view:")
        print("-------------------------------------------")
        display(df)

print("Vocabulary mapping evaluation and processing complete.")


## [Optional] Update/Override Generated Queries as Necessary
Review any queries that have not passed the syntax check, as these need to be remedied before they can be published and executed. Any other queries that do not align with expectations can be overridden by either A) Updating the mapping target specification and re-running the previous step, or B) Manually overriding the query below. Option B should only be used in one-off cases.

### Build Base Query Dictionary

In [None]:
# Input the appropriate dataset and mapping target specification
dataset_id = "f1e1ef01-d52d-423e-a65b-3a1d26c7ee9d"
mapping_target = "anvil"
mapping_target_spec = "cmg_ext_2"

# Retrieve source schema
src_schema_dict = {}
# Pass the TDR URL explicitly, matching every other refresh_tdr_api_client call in this notebook
api_client = utils.refresh_tdr_api_client(params["tdr_url"])
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
try:
    response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
    src_schema_dict["name"] = response["name"]
    src_schema_dict["tables"] = response["schema"]["tables"]
    src_schema_dict["relationships"] = response["schema"]["relationships"]
    bq_project = response["access_information"]["big_query"]["project_id"]
    bq_schema = response["access_information"]["big_query"]["dataset_name"]
    phs_id = response["phs_id"]
except Exception as e:
    # Discard any partially populated schema so the "missing artifacts" path below is taken
    src_schema_dict = {}
    print("Error retrieving source schema from TDR. Error: {}".format(e))

# Retrieve target schema
target_schema_dict = {}
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
    target_schema_dict = json.loads(blob.download_as_string(client=None))
except Exception as e:
    print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))

# Retrieve mapping specification. The substitution values are derived from the source schema,
# so this step is only attempted when the schema was retrieved successfully.
mapping_spec = {}
if src_schema_dict:
    # Set dataset name and project name parameters to substitute into transform queries
    # (the regex strips a trailing "_<digits>" suffix from the TDR dataset name)
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = utils.derive_project_name(dataset_id, phs_id, dataset_name_value, params)
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
        blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
        blob_string = blob_string.replace("$BQ_DATASET", bq_project + "." + bq_schema)
        # NOTE(review): $PROJECT_PHS and $DATASET_CONSENT are not substituted here because this
        # cell takes no PHS/consent inputs - confirm whether the chosen specification uses them.
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

# Build queries from mapping specification
query_dict = {}
# Initialised up front so the check after the loop is defined even when the target schema has no tables
missing_artifacts = False
if target_schema_dict:
    artifacts_available = bool(src_schema_dict and mapping_spec)
    for target_table in target_schema_dict["tables"]:
        table_name = target_table["name"]
        if artifacts_available:
            query_dict[table_name] = bmq.build_mapping_query(target_table, src_schema_dict, mapping_spec, bq_project, bq_schema)
        else:
            # Keep a placeholder row per target table so the results frame still lists every table
            missing_artifacts = True
            query_dict[table_name] = {"query": "", "syntax_check": ""}
    if missing_artifacts:
        print("Source schema dictionary and/or mapping specification missing. Unable to generate queries.")
else:
    print("Target schema dictionary missing. Unable to generate queries.")

# Display query dictionary
query_df = (
    pd.DataFrame.from_dict(query_dict, orient="index")
    .rename_axis("target_table")
    .reset_index()
)
display(query_df)
    


### Update Query Dict as Necessary

In [None]:
# Manual override: name the target table whose generated query should be replaced, and the replacement SQL
target_table = "anvil_donor"
query = "SELECT 1"

# Store the override on the existing entry, then re-run the syntax check against the new query text
override_entry = query_dict[target_table]
override_entry["query"] = query
override_entry["syntax_check"] = bmq.run_syntax_check(query)
print(override_entry)


### Publish Updated Query Dict

In [None]:
# Copy target schema file to output folder for mapping target
source_path = "ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target)
destination_path = "ingest_pipeline/output/transformed/{}/{}/schema/mapping_schema_object.json".format(mapping_target, dataset_id)
!gsutil cp $ws_bucket/$source_path $ws_bucket/$destination_path 2> stdout

# Limit query dict to valid queries, write out, and copy to output folder for mapping target
valid_query_dict = {}
for target, val in query_dict.items():
    if val["syntax_check"] == "Passed":
        valid_query_dict[target] = val
final_query_dict = {
    "dataset_id": dataset_id,
    "transforms": valid_query_dict
}
query_dict_json = json.dumps(final_query_dict)
query_output_file = "transform_query_set.json"
with open(query_output_file, 'w') as outfile:
    outfile.write(query_dict_json)
destination_path = "ingest_pipeline/output/transformed/{}/{}/queries".format(mapping_target, dataset_id)
!gsutil cp $query_output_file $ws_bucket/$destination_path/ 2> stdout

# "T" Pipeline: Load Additional Transformed Tables to TDR

## Pipeline Run Variables

In [None]:
# Run Variables
# Each row of dataset_id_run_list describes one TDR dataset to process. The cells below read the
# columns by position: [0] dataset_id, [1] auth domain, [2] PHS ID, [3] consent code,
# [4] consent short name, [5] dataset ticket, [6] run flag (rows with a falsy flag are ignored).
dataset_id_run_list = [
    #["dataset_id", "auth_domain", "phs", "consent_code", "consent", "dataset_ticket", Run (True/False)],   
    ['b8da0ffa-8ff8-439a-be42-80353a43c5c5', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_AUSALF_HMB_IRB_WES', 'phs001489', 'c27', 'HMB-IRB-MDS', 'ANVIL-218', True],
    ['13f2c58b-70c2-4dfe-9a24-b3d05572ad8f', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EP_BA_CN_ID_MDS_WES', 'phs001489', 'c3', 'DS-EPSBACID-MDS-RD', 'ANVIL-220', True],
    ['ad3758aa-1acb-42a8-b956-9c6355ddb1ac', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPI_BA_ID_MDS_WES', 'phs001489', 'c1', 'DS-EPSBAID-MDS-RD', 'ANVIL-222', True],
    ['a1782b2d-57f6-4e61-839d-2a659e576343', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPIL_BA_MDS_WES', 'phs001489', 'c2', 'DS-EPSBA-MDS-RD', 'ANVIL-224', True],
    ['6f543971-3165-4ab4-944b-412cdb2dcb95', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS_EAED_IRB_WES', 'phs001489', 'c33', 'DS-EAED-IRB-NPU-MDS', 'ANVIL-225', True],
    ['68c04e98-2db0-4a1d-a4c8-fd087d1f8836', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS_EAED_MDS_WES', 'phs001489', 'c22', 'DS-EAED-MDS', 'ANVIL-228', True],
    ['65d29659-803e-44e8-9392-b89d5d7b13f6', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_AUTMUV_DS_NS_ADLT_WES', 'phs001489', 'c30', 'DS-NSD-ADULTS-NPU-MDS', 'ANVIL-229', True],
    ['bb403e99-d70b-4296-83dd-f63148a3a3f5', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_AUTMUV_DS_NS_WES', 'phs001489', 'c29', 'DS-NSD-NPU-MDS', 'ANVIL-232', True],
    ['0a90128d-a601-4adb-b7dd-e28e6c2e3983', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_BELATW_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-234', True],
    ['3125a5df-f5d6-4355-8a64-ef67d9ef6768', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_BELULB_DS_EP_NPU_WES', 'phs001489', 'c17', 'DS-EP-NPU', 'ANVIL-236', True],
    ['74758e76-8bfb-4775-8b34-5362df49b55b', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_BRAUSP_DS_WES', 'phs001489', 'c32', 'DS-MBND-NPU-MDS', 'ANVIL-238', True],
    ['04de9328-da6a-42fd-b4a1-1c9dbc99d256', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_CANCAL_GRU_v2_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-239', True],
    ['cc39ad3d-9884-4ecb-8633-9444f613741c', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_CANUTN_DS_EP_WES', 'phs001489', 'c10', 'DS-EP', 'ANVIL-242', True],
    ['8b27e095-d690-4c40-b1de-5197ba15234c', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_CHEUBB_HMB_IRB_MDS_WES', 'phs001489', 'c27', 'HMB-IRB-MDS', 'ANVIL-244', True],
    ['92942434-e960-4b2e-95f8-b04b816b44d4', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_CYPCYP_HMB_NPU_MDS_WES', 'phs001489', 'c8', 'HMB-NPU-MDS', 'ANVIL-246', True],
    ['de213a72-4128-46ef-a321-55f85bc6c7ed', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_CZEMTH_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-248', True],
    ['58e05ce6-4e1d-4c5f-a6a5-cca94d457739', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_DEUPUM_HMB_MDS_WES_Year3', 'phs001489', 'c11', 'HMB-MDS', 'ANVIL-250', True],
    ['f63fdec8-154e-43ee-965c-4ccea37ef852', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_DEUUGS_DS_EP_MDS_WES', 'phs001489', 'c16', 'DS-EP-MDS', 'ANVIL-252', True],
    ['11fb953c-1580-4f96-ab48-249e6e96f9a7', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKB_HMB_NPU_MDS_WES', 'phs001489', 'c8', 'HMB-NPU-MDS', 'ANVIL-254', True],
    ['3db6d9cb-923f-44fe-a8d3-1c861541b39d', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKL_HMB_WES', 'phs001489', 'c19', 'HMB', 'ANVIL-256', True],
    ['8b371868-d4ed-4fed-ad10-bde377107592', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_DEUULG_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-258', True],
    ['4f6027b3-9f3a-4a17-ba2d-58204cc6ea3f', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_DEUUTB_HMB_NPU_MDS_WES', 'phs001489', 'c8', 'HMB-NPU-MDS', 'ANVIL-261', True],
    ['c6217265-252c-46a6-9b92-286442ddc83c', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_FINKPH_EPIL_MDS_WES', 'phs001489', 'c4', 'DS-EPCOM-MDS-RD', 'ANVIL-262', True],
    ['92ed1a10-4b14-46c3-b3f2-8f3f0d44f62f', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_FINUVH_HMB_NPU_MDS_WES', 'phs001489', 'c8', 'HMB-NPU-MDS', 'ANVIL-265', True],
    ['e05d8c1e-10d6-4ab4-8033-f25471cee13e', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_FRALYU_HMB_WES', 'phs001489', 'c11', 'HMB-MDS', 'ANVIL-267', True],
    ['0a41abbf-7b9b-4671-b2fd-a61392a179ec', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_GBRSWU_CARDI_NEURO_WES', 'phs001489', 'c14', 'DS-CARNEU-MDS', 'ANVIL-269', True],
    ['8e966c04-a747-47a1-9160-7da1b07031d9', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_GBRUCL_DS_EARET_MDS_WES', 'phs001489', 'c24', 'DS-EARET-MDS', 'ANVIL-271', True],
    ['5fd356c2-cbae-453f-a585-82e802417041', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_GBRUNL_EP_ETIOLOGY_MDS_WES', 'phs001489', 'c9', 'DS-EPASM-MDS-RD', 'ANVIL-273', True],
    ['7df4b9aa-c2d5-4df7-8381-126328e159df', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_GBRUNL_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-275', True],
    ['44deae8b-ce4b-4246-b3e6-69a70c867cc7', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_GHAKNT_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-277', True],
    ['4ff73d9e-8286-4fe0-b813-7f2821a49884', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_HKGHKK_HMB_MDS_WES', 'phs001489', 'c11', 'HMB-MDS', 'ANVIL-279', True],
    ['3e25160d-37e2-4ede-b405-5c30a0c2567a', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-281', True],
    ['4200aab1-089c-4d1a-a010-cef4354a1d2f', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_HRVUZG_HMB_MDS_WES', 'phs001489', 'c11', 'HMB-MDS', 'ANVIL-283', True],
    ['c7989cca-b3c9-4361-be78-f71162dfee7e', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_IRLRCI_GRU_IRB_WES', 'phs001489', 'c12', 'GRU-IRB', 'ANVIL-285', True],
    ['d3d5c207-5ec4-40de-8a26-416d5faaba5d', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_ITAICB_HMB_NPU_MDS_WES', 'phs001489', 'c8', 'HMB-NPU-MDS', 'ANVIL-287', True],
    ['186c2956-36d8-4318-9b7a-f0303ad97694', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_ITAIGI_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-289', True],
    ['e0f56709-eb6c-4e7f-bbff-5fe18fb89f61', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_ITAUBG_DS_EPI_NPU_MDS_WES', 'phs001489', 'c20', 'DS-EPI-ADULT-NPU-MDS', 'ANVIL-291', True],
    ['f2206d9b-2582-4204-b81a-bb833f9d8e2f', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMC_DS_NEURO_MDS_WES', 'phs001489', 'c23', 'DS-NEUROLOGY-MDS', 'ANVIL-293', True],
    ['44d0ff4d-f5ad-491b-a617-f8872e3a1431', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMR_GRU_NPU_WES', 'phs001489', 'c21', 'GRU-NPU', 'ANVIL-295', True],
    ['4846c93f-46a0-4bf2-a34d-ba2b55513036', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_JPNFKA_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-297', True],
    ['44203821-dbad-41f2-bac1-983da35842a4', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_JPNRKI_DS_NPD_IRB_NPU_WES', 'phs001489', 'c25', 'DS-NPD-IRB-NPU', 'ANVIL-299', True],
    ['9fad8665-e36b-4efe-a8da-19d897f89b1c', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_KENKIL_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-301', True],
    ['cbba277b-d6d5-4ab6-89e4-ffc58f6656e6', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_DS_Epilepsy_WES', 'phs001489', 'c16', 'DS-EP-MDS', 'ANVIL-303', True],
    ['2d344396-765b-4864-9def-52e1632288e9', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-305', True],
    ['0e9c0430-9d99-4e91-84ef-a9b34e068de9', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_LTUUHK_HMB_NPU_MDS_WES', 'phs001489', 'c8', 'HMB-NPU-MDS', 'ANVIL-307', True],
    ['a35d5089-fcc4-40b1-991e-30b633c4f7d4', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_NZLUTO_EPIL_BC_ID_MDS_WES', 'phs001489', 'c5', 'DS-EPSBACID-NPU-MDS-RD', 'ANVIL-309', True],
    ['7f67466f-90f7-4dbb-8a2f-6f0201f05e7f', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_TURBZU_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-311', True],
    ['f8f89ba2-da06-4b1c-a945-ea7c634966cf', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_TURIBU_DS_NEURO_AD_NPU_WES', 'phs001489', 'c26', 'DS-NEUROLOGY-ADULTS-NPU', 'ANVIL-313', True],
    # NOTE(review): the auth domain on the next row has no "AUTH_" prefix, unlike every other row - confirm the group name
    ['f59f4876-bb40-44f5-a24f-d06e7ce0c955', 'AnVIL_CCDG_Broad_NP_Epilepsy_TWNCGM_HMB_NPU_AdultsONLY_WES', 'phs001489', 'c8', 'HMB-NPU-MDS', 'ANVIL-318', True],
    ['09c76765-5f11-49c2-b2ec-dbe8bb6e8a10', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USABCH_EPI_MUL_CON_MDS_WES', 'phs001489', 'c6', 'DS-EPI-MULTI-MDS', 'ANVIL-320', True],
    ['899134c5-81c2-48e9-b411-c0b85770ab12', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USABLC_GRU_NPU_WES', 'phs001489', 'c21', 'GRU-NPU', 'ANVIL-322', True],
    ['d6badb13-5719-42e0-b44a-42aeb518f124', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USACCF_HMB_MDS_WES', 'phs001489', 'c11', 'HMB-MDS', 'ANVIL-324', True],
    ['e0e24a53-3b37-4665-9cba-cfa80e31a3c3', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USACCH_DS_NEURO_MDS_WES', 'phs001489', 'c23', 'DS-NEUROLOGY-MDS', 'ANVIL-326', True],
    ['eee9db44-de4f-4c3d-803e-f2e751b24d23', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USACHP_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-328', True],
    ['f2ac8c6f-3b5d-43a6-9843-176df23bce8c', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USACRW_DS_EP_MDS_WES', 'phs001489', 'c16', 'DS-EP-MDS', 'ANVIL-330', True],
    ['a1468614-947d-4789-a516-8b64b036e02b', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USACRW_DS_SEIZD_WES', 'phs001489', 'c15', 'DS-SEIZD', 'ANVIL-332', True],
    ['9c679f21-5457-4161-a6f9-f711e408276f', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USACRW_EPI_ASZ_MED_MDS_WES', 'phs001489', 'c7', 'DS-EPASM-MDS', 'ANVIL-334', True],
    ['278d687b-6dd0-4e9e-ba1b-a1467f2b00df', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAEGP_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-336', True],
    ['49ed4ad0-eeb9-4377-90c0-42f9235efc1c', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAFEB_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-338', True],
    ['3663ca97-38ef-49ee-b559-f934cd4c4cb6', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAHEP_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-340', True],
    ['e6820957-741d-4c9c-9010-b51a192b321d', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USALCH_HMB_WES', 'phs001489', 'c11', 'HMB-MDS', 'ANVIL-342', True],
    ['b823a87b-cc65-4547-9a16-9f8c18932806', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAMGH_HMB_MDS_WES', 'phs001489', 'c11', 'HMB-MDS', 'ANVIL-344', True],
    ['442e4540-312c-42e3-a8f0-985966340fe5', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAMGH_MGBB_HMB_MDS_WES', 'phs001489', 'c11', 'HMB-MDS', 'ANVIL-346', True],
    ['9fa5196b-29ba-4e52-a49e-5b073b79cc72', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_NPU_WES', 'phs001489', 'c21', 'GRU-NPU', 'ANVIL-351', True],
    ['2deee87e-9aef-49dc-8d8f-42294717f2cc', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-352', True],
    ['f06949e0-e526-4b33-b00e-2691cc9e694a', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_HMB_WES', 'phs001489', 'c8', 'HMB-NPU-MDS', 'ANVIL-354', True],
    ['fcccfacc-05fb-4814-ba7a-bc6d2757e213', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAMSS_DS_EP_NEURO_MDS_WES', 'phs001489', 'c16', 'DS-EP-MDS', 'ANVIL-356', True],
    ['b307a1c6-8e72-456b-8669-cabd4851cd72', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USANCH_DS_NEURO_MDS_WES', 'phs001489', 'c31', 'DS-NEURO-EP-MDS', 'ANVIL-358', True],
    ['41579c21-3a39-429c-b91e-7c5d0eb08f2a', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_Marsh_GRU_NPU_WES', 'phs001489', 'c21', 'GRU-NPU', 'ANVIL-363', True],
    ['b6dccf69-84ed-4b58-ac6e-d681f3163d92', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_Marsh_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-364', True],
    ['348b5d5b-f819-459b-a419-a33312e0e908', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_Rader_GRU_WES', 'phs001489', 'c13', 'GRU', 'ANVIL-366', True],
    ['cf8faf19-45c5-479d-93d8-1e13b954642a', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAVAN_HMB_GSO_WES', 'phs001489', 'c28', 'HMB-GSO', 'ANVIL-368', True],
    ['e69f28e9-73bf-44b6-90eb-40ae566ccb3f', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_USAVANcontrols_HMB_GSO_WES', 'phs001489', 'c28', 'HMB-GSO', 'ANVIL-370', True],
    ['b2f559a5-c853-4bfa-9546-cde5aa380c51', 'AUTH_AnVIL_CCDG_Broad_NP_Epilepsy_ZAFAGN_DS_EPI_COMO_MDS_WES', 'phs001489', 'c16', 'DS-EP-MDS', 'ANVIL-372', True],
    ['52b75821-1d51-49d2-bff2-d79d45f3ed7d', 'AUTH_AnVIL_CCDG_Broad_Spalletta_HMB_NPU_MDS_WES', 'phs001489', 'c8', 'HMB-NPU-MDS', 'ANVIL-373', True],  
]
# Pipeline switches stored on the shared params dict, which is later handed to utils.run_t_pipeline(params).
# NOTE(review): how each flag is interpreted lives in the utils notebook - confirm there before relying on it.
params["mapping_target"] = "anvil"
params["skip_transforms"] = False
params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names 
params["skip_dataset_updates"] = False
params["skip_ingests"] = False
params["trunc_before_ingest"] = True
params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
params["skip_file_relation_inference"] = False
params["skip_dangling_fk_resolution"] = False
params["skip_supplementary_file_identification"] = False
params["skip_snapshot_creation"] = False
params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org", "auth-domain"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
params["skip_data_validation"] = True

#-----------------------------------------------------------------------------------------------------------#

# Print variables
# Echo the run configuration so it can be reviewed before the pipeline is executed
print("Pipeline run variables set:")
print("Profile ID: " + params["profile_id"])
print("Mapping Target: " + params["mapping_target"])
print("Datasets to run: ")
api_client = utils.refresh_tdr_api_client(params["tdr_url"])
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
dataset_skip_list = []
for dataset in dataset_id_run_list:
    # Column 6 is the run flag; rows that are switched off are neither listed nor looked up
    if not dataset[6]:
        continue
    dataset_id = dataset[0]
    try:
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        dataset_properties = dataset_info["properties"]
        src_workspaces = dataset_properties.get("source_workspaces") if dataset_properties else []
    except Exception:
        # Catch Exception rather than using a bare except so a kernel interrupt is not
        # swallowed and misreported as an inaccessible dataset
        dataset_name = ""
        dataset_skip_list.append(dataset_id)
        continue
    if not dataset_name:
        continue
    # The remaining values come straight from the run list row
    auth_domains = [dataset[1]] if dataset[1] else []
    phs_id = dataset[2]
    consent_code = dataset[3]
    consent_name = dataset[4]
    dataset_ticket = dataset[5]
    print("- " + dataset_name + " ({})".format(dataset_id))
    print("\t- PHS ID = " + str(phs_id))
    print("\t- Consent Short Name = " + str(consent_name))
    print("\t- Consent Code = " + str(consent_code))
    print("\t- Auth Domains = " + str(auth_domains))
    print("\t- Source Workspaces = " + str(src_workspaces))
    print("\t- Dataset Ticket = " + str(dataset_ticket))
if dataset_skip_list:
    print("Datasets to skip (they either don't exist or aren't accessible to the current user): ")
    print("\t- " + "\n\t- ".join(dataset_skip_list))
print("Skip transforms? " + str(params["skip_transforms"]))
print("Transforms override list: " + str(params["transform_list_override"]))
print("Skip dataset schema and property updates? " + str(params["skip_dataset_updates"]))
print("Skip ingests? " + str(params["skip_ingests"]))
print("Truncate tables before ingest? " + str(params["trunc_before_ingest"]))
print("Ingest override list: " + str(params["ingest_list_override"]))
print("Skip file relationship inference? " + str(params["skip_file_relation_inference"]))
print("Skip dangling foreign key resolution? " + str(params["skip_dangling_fk_resolution"]))
print("Skip supplementary file identification? " + str(params["skip_supplementary_file_identification"]))
print("Skip snapshot creation? " + str(params["skip_snapshot_creation"]))
print("Skip data validation? " + str(params["skip_data_validation"]))


## Pipeline Execution

In [None]:
# Loop through and execute pipeline for listed workspaces
!mkdir -p pipeline_results
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M")
t_output_file = "pipeline_results/t_pipeline_results_" + current_datetime_string + ".json"
logging.info(f"T Pipeline Results File: {t_output_file}")
params["t_pipeline_results"] = []
for dataset in dataset_id_run_list:
    if dataset[6]:
        dataset_id = dataset[0]
        try:
            api_client = utils.refresh_tdr_api_client(params["tdr_url"])
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            dataset_name = dataset_info["name"]
            phs_id = dataset[2]
            if dataset_info["properties"]:
                consent_code = dataset[3]
                consent_name = dataset[4]
                auth_domains = [dataset[1]] if dataset[1] else []
                src_workspaces = dataset_info["properties"].get("source_workspaces")
                dataset_ticket = dataset[5]
            else:
                consent_code = dataset[3]
                consent_name = dataset[4] 
                auth_domains = [dataset[1]] if dataset[1] else []
                src_workspaces = []
                dataset_ticket = dataset[5]
        except:
            dataset_name = ""
        if dataset_name:
            params["dataset_id"] = dataset_id
            params["dataset_name"] = dataset_name
            params["phs_id"] = phs_id
            params["consent_name"] = consent_name
            params["consent_code"] = consent_code
            params["auth_domains"] = auth_domains
            params["dataset_ticket"] = dataset_ticket
            utils.run_t_pipeline(params)
            params["t_pipeline_results"].extend(params["pipeline_results"])
            if params["t_pipeline_results"]:
                with open(t_output_file, "w") as t_out:
                    json.dump(params["t_pipeline_results"], t_out)
        

# Utility Scripts
Uncomment sections as necessary to accomplish various miscellaneous tasks.

## Monitor or Review Pipeline Progress

In [None]:
# List the results files saved under pipeline_results/
!ls pipeline_results
# Uncomment the next line to delete previously saved pipeline results files
# !rm pipeline_results/*_pipeline_results_* 

In [None]:
# Results file to load
results_file = "pipeline_results/t_pipeline_results_202507300051.json"

# Read the saved results records back from disk
with open(results_file, "r") as f:
    loaded_results = json.load(f)

# The column layout depends on which pipeline wrote the file, identified from the file name;
# anything that is not an E or L pipeline file uses the dataset-based layout
if "e_pipeline" in results_file:
    results_columns = ["Workspace", "Staging Area", "Time", "Step", "Status", "Message"]
elif "l_pipeline" in results_file:
    results_columns = ["Staging Area", "Time", "Step", "Task", "Status", "Message"]
else:
    results_columns = ["Dataset", "Time", "Step", "Task", "Status", "Message"]
df_results = pd.DataFrame(loaded_results, columns=results_columns)

In [None]:
## Display all results
#display(df_results)

## Display all failures
#display(df_results[df_results["Status"] != "Success"])

## Display specific L pipeline results
#display(df_results[(df_results["Staging Area"] == "AnVIL_CCDG_Broad_Spalletta_HMB_NPU_MDS_WES") & ((df_results["Task"] == "Create New Dataset") | (df_results["Status"] == "Error"))])

## Display specific T pipeline results
#display(df_results[(df_results["Dataset"].str.contains("b8da0ffa-8ff8-439a-be42-80353a43c5c5", case=False, na=False))])
#display(df_results[(df_results["Dataset"].str.contains("13f2c58b-70c2-4dfe-9a24-b3d05572ad8f", case=False, na=False)) & ((df_results["Task"] == "Create and Share Snapshot") | (df_results["Status"] == "Error"))])
# Active view: every snapshot-creation row plus every row that ended in an error
is_snapshot_task = df_results["Task"] == "Create and Share Snapshot"
is_error = df_results["Status"] == "Error"
display(df_results.loc[is_snapshot_task | is_error])

# Excel functions
# =REGEXEXTRACT(G2, "'id': '(.*)', 'name'",2,0)
# =REGEXEXTRACT(G2, "'name': '(.*)', 'description'",2,0)

## Collect AnVIL Snapshots and Datasets

In [None]:
# Define parameters
tdr_url = "https://data.terra.bio"
sam_url = "https://sam.dsde-prod.broadinstitute.org"
#billing_profile_list = ["e0e03e48-5b96-45ec-baa4-8cc1ebf74c61", "9ee23bed-b46c-4561-9103-d2a723113f7f"]
billing_profile_list = ["e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"]
dataset_id_list = [
] # Leave empty to process every dataset returned by TDR

# Collect Anvil datasets and snapshots
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
logging.info(f"Start time: {current_datetime_string}")
api_client = utils.refresh_tdr_api_client(tdr_url)
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
datasets_list = datasets_api.enumerate_datasets(limit=5000)
# Denominator for the progress messages: the explicit ID list caps it when one is given
if dataset_id_list:
    dataset_list_len = min(len(datasets_list.items), len(dataset_id_list))
else:
    dataset_list_len = len(datasets_list.items)
records_list = []
dataset_count = 0
for dataset_entry in datasets_list.items:
    if len(dataset_id_list) == 0 or dataset_entry.id in dataset_id_list:
        dataset_count += 1
        logging.info(f"Processing dataset {dataset_count} of {dataset_list_len}")
        api_client = utils.refresh_tdr_api_client(tdr_url)
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        # Only datasets on one of the listed billing profiles are reported
        if dataset_entry.default_profile_id in billing_profile_list:
            dataset_detail = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["PROPERTIES", "DATA_PROJECT"])
            snapshots_list = snapshots_api.enumerate_snapshots(dataset_ids=[dataset_entry.id], limit=1000)
            ds_props = dataset_detail.properties
            if ds_props:
                props_src_ws = ds_props.get("source_workspaces")
                props_ds_ticket = ds_props.get("dataset_ticket")
            else:
                props_src_ws = ""
                props_ds_ticket = ""
            # Prefer the dataset ticket as the identifier; fall back to the source workspace names
            dataset_identifier = ""
            if props_ds_ticket:
                dataset_identifier = props_ds_ticket
            elif props_src_ws:
                dataset_identifier = ", ".join(props_src_ws)
            if len(snapshots_list.items) == 0:
                # No snapshots: emit one row with the snapshot columns left empty
                record = [None, None, None, None, None, None, None, None, None, dataset_entry.id, dataset_entry.name, dataset_detail.ingest_service_account, dataset_entry.created_date[0:10], dataset_entry.cloud_platform, dataset_entry.secure_monitoring_enabled, dataset_identifier]
                records_list.append(record)
            else:
                snapshot_list_len = len(snapshots_list.items)
                snapshot_count = 0
                for snapshot_entry in snapshots_list.items:
                    snapshot_count += 1
                    logging.info(f"Processing snapshot {snapshot_count} of {snapshot_list_len} for dataset {dataset_count}")
                    # Get public policy information
                    creds, project = google.auth.default()
                    auth_req = google.auth.transport.requests.Request()
                    creds.refresh(auth_req)
                    public_flag = "N"
                    public_response = requests.get(
                        url=f"{sam_url}/api/resources/v2/datasnapshot/{snapshot_entry.id}/policies/reader/public",
                        headers={"Authorization": f"Bearer {creds.token}"},
                    )
                    if public_response.text == "true":
                        public_flag = "Y"
                    # Get snapshot DUOS ID and Lock status
                    api_client = utils.refresh_tdr_api_client(tdr_url)
                    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
                    duos_id = ""
                    lock_status = False
                    try:
                        snapshot_detail = snapshots_api.retrieve_snapshot(id=snapshot_entry.id, include=["DUOS"])
                        if snapshot_detail.duos_firecloud_group:
                            duos_id = snapshot_detail.duos_firecloud_group.duos_id
                        lock_name = snapshot_detail.resource_locks.exclusive
                        if lock_name:
                            lock_status = True
                    except Exception:
                        # Best effort: keep whatever DUOS/lock values were gathered before the failure
                        pass
                    # Get snapshot readers and auth domain
                    # Reset per snapshot: otherwise a failed lookup or a missing "reader" role would
                    # report the previous snapshot's readers (or raise NameError on the first snapshot)
                    readers = ""
                    ad_groups = ""
                    try:
                        snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_entry.id)
                        for role in snapshot_policy_response.policies:
                            if role.name == "reader":
                                readers = ", ".join(role.members or [])
                        if snapshot_policy_response.auth_domain:
                            ad_groups = ", ".join(snapshot_policy_response.auth_domain)
                    except Exception:
                        ad_groups = "ACCESS_MISSING"
                    record = [snapshot_entry.id, snapshot_entry.name, snapshot_entry.created_date[0:10], public_flag, readers, ad_groups, duos_id, snapshot_entry.data_project, lock_status, dataset_entry.id, dataset_entry.name, dataset_detail.ingest_service_account, dataset_entry.created_date[0:10], dataset_entry.cloud_platform, dataset_entry.secure_monitoring_enabled, dataset_identifier]
                    records_list.append(record)
df = pd.DataFrame(records_list, columns =["Snapshot ID", "Snapshot Name", "Snapshot Created Date", "Snapshot Public", "Snapshot Readers", "Snapshot Auth Domain", "Snapshot DUOS ID", "Snapshot Data Project", "Snapshot Locked", "Source Dataset ID", "Source Dataset Name", "Source Dataset SA", "Source Dataset Created Date", "Cloud Platform", "Secure Monitoring", "Dataset Identifier"])
df_sorted = df.sort_values(["Dataset Identifier", "Source Dataset Name", "Snapshot Name"], ascending=[True, True, True], ignore_index=True)
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
logging.info(f"End time: {current_datetime_string}")
display(df_sorted)


## Soft Deletion of TDR Dataset Records

In [None]:
# Input parameters
# run_env is handed to utils.wait_for_tdr_job; tdr_url is the TDR endpoint the API clients are built for
run_env = "prod"
tdr_url = "https://data.terra.bio"

# Datasets whose rows will be soft deleted
dataset_id_list = ["1e84fc06-90e2-4b76-bae1-81b92822e761"]

# Tables to process in every dataset; the commented lines are alternative presets (uncomment one to use it)
table_list = ["file"]
#table_list = ["anvil_activity", "anvil_alignmentactivity", "anvil_antibody", "anvil_assayactivity", "anvil_biosample", "anvil_dataset", "anvil_diagnosis", "anvil_donor", "anvil_file", "anvil_project", "anvil_sequencingactivity", "anvil_variantcallingactivity"]
#table_list = ["file_inventory", "sample", "subject", "workspace_attributes", "sequencing", "qc_result_sample", "family", "chromosome", "interval", "participant", "discovery", "sample_set", "vcf"]
#table_list = ["anvil_dataset", "anvil_project"]
#table_list = ["sequence_file", "specimen_from_organism", "imaging_protocol", "ipsc_induction_protocol", "image_file", "analysis_process", "cell_line", "supplementary_file", "protocol", "cell_suspension", "analysis_protocol", "dissociation_protocol", "project", "sequencing_protocol", "donor_organism", "enrichment_protocol", "organoid", "collection_protocol", "library_preparation_protocol", "analysis_file", "differentiation_protocol", "aggregate_generation_protocol", "process", "links", "reference_file", "imaging_preparation_protocol", "imaged_specimen"] #HCA/Lungmap

# Row selection: True collects and deletes every row of each table; False deletes only the IDs listed below
delete_all_records = True
delete_record_list = []  # Will be ignored if delete_all_records is set to True

#--------------------------------------------------------------------------------------------------------

# Function to delete rows from a dataset
def delete_datarepo_rows(dataset_id, table_name, datarepo_row_ids, tdr_url):
    """Submit a soft-delete request for specific rows of one dataset table.

    Args:
        dataset_id: ID of the dataset that owns the table.
        table_name: Name of the table the rows belong to.
        datarepo_row_ids: Row IDs to delete; when empty, nothing is submitted.
        tdr_url: Endpoint used to build the TDR API client.

    Returns:
        None. The job result, or the error text if anything raises, is printed.
        The module-level ``run_env`` is forwarded to ``utils.wait_for_tdr_job``.
    """
    print("Attempting to delete specified rows from {} for dataset {}".format(table_name, dataset_id))

    # Guard clause: without row IDs there is no request to send
    if not datarepo_row_ids:
        print("No datarepo_row_ids specified for deletion.")
        return

    # Assemble the request from the inside out
    row_spec = {"rowIds": datarepo_row_ids}
    table_spec = {"tableName": table_name, "jsonArraySpec": row_spec}
    deletion_request = {"deleteType": "soft", "specType": "jsonArray", "tables": [table_spec]}

    try:
        tdr_client = utils.refresh_tdr_api_client(tdr_url)
        tdr_datasets = data_repo_client.DatasetsApi(api_client=tdr_client)
        deletion_job = tdr_datasets.apply_dataset_data_deletion(id=dataset_id, data_deletion_request=deletion_request)
        outcome, _ = utils.wait_for_tdr_job(deletion_job, run_env)
        print("Result: {}".format(outcome))
    except Exception as err:
        # Failures are reported, not raised, so a caller looping over tables keeps going
        print("Error: {}".format(str(err)))

# Function to collect all datarepo rows for a particular table within a dataset
def collect_all_datarepo_rows(dataset_id, table_name, tdr_url):
    """Return the datarepo_row_id of every row in one table of a TDR dataset.

    When the dataset's access information includes BigQuery details, the IDs
    are read with a single BigQuery query. Otherwise they are paged out of the
    TDR dataset query API, 1000 rows at a time.

    Args:
        dataset_id: ID of the dataset to read.
        table_name: Name of the table whose row IDs are wanted.
        tdr_url: Endpoint used to build the TDR API client.

    Returns:
        A list of datarepo_row_id values. The list is empty when the dataset
        lookup or the BigQuery query fails (the error is printed), and may be
        partial when paging through the TDR API fails after all retries (a
        warning is logged).
    """
    # Find out where the dataset's tabular data can be read from
    try:
        api_client = utils.refresh_tdr_api_client(tdr_url)
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        big_query_info = response["access_information"]["big_query"]
        if big_query_info:
            bq_project = big_query_info["project_id"]
            bq_schema = big_query_info["dataset_name"]
    except Exception as e:
        print("Error retrieving dataset information: {}".format(str(e)))
        # Stop here: nothing below can run without the lookup result (continuing used to raise NameError)
        return []

    if big_query_info:
        client = bigquery.Client()
        query = "SELECT datarepo_row_id FROM `{project}.{schema}.{table}`".format(project = bq_project, schema = bq_schema, table = table_name)
        try:
            query_job = client.query(query)
            return [row["datarepo_row_id"] for row in query_job]
        except Exception as e:
            print("Error retrieving datarepo_row_id list: {}".format(str(e)))
            # Always hand back a list so callers can rely on one return type
            return []

    # No BigQuery access information: page through the TDR dataset query API instead
    max_page_size = 1000
    max_retries = 5
    retry_wait_seconds = 10
    results = []
    total_record_count = 1  # placeholder so the first page is requested; replaced by the API's reported total
    while len(results) < total_record_count:
        payload = {
          "offset": len(results),
          "limit": max_page_size,
          "sort": "datarepo_row_id",
          "direction": "asc",
          "filter": ""
        }
        page_ids = None
        for attempt in range(max_retries + 1):
            try:
                dataset_results = datasets_api.query_dataset_data_by_id(id=dataset_id, table=table_name, query_data_request_model=payload).to_dict()
                total_record_count = dataset_results["total_row_count"]
                # Extract the whole page before touching `results`, so a retry can never duplicate IDs
                page_ids = [record["datarepo_row_id"] for record in dataset_results["result"]]
                break
            except Exception:
                if attempt < max_retries:
                    sleep(retry_wait_seconds)
        if page_ids is None:
            # Every attempt for this page failed; give up and return what has been gathered so far
            warn_str = "Error retrieving data_repo_row_ids for table."
            logging.warning(warn_str)
            break
        if not page_ids:
            # An empty page means there is nothing further to read, whatever total was reported;
            # without this check the loop would request the same offset forever
            break
        results.extend(page_ids)
    return results
    
# Function to loop through datasets and delete
def execute_deletions(dataset_id_list, table_list, delete_all_records, delete_record_list, tdr_url):
    """Soft delete rows from every listed table of every listed dataset.

    Args:
        dataset_id_list: IDs of the datasets to process.
        table_list: Table names to process within each dataset.
        delete_all_records: When truthy, every row ID of the table is collected
            and deleted; otherwise ``delete_record_list`` is used as given.
        delete_record_list: Explicit row IDs to delete (used only when
            ``delete_all_records`` is falsy).
        tdr_url: Endpoint passed through to the collection and deletion helpers.
    """
    for dataset_id in dataset_id_list:
        print(f"Processing record deletions for dataset {dataset_id}")
        for table in table_list:
            print(f"Processing record deletion for {table}")
            # Pick the row IDs: everything in the table, or just the caller's explicit list
            row_ids = collect_all_datarepo_rows(dataset_id, table, tdr_url) if delete_all_records else delete_record_list
            if not row_ids:
                print("No records specified for deletion.")
                continue
            delete_datarepo_rows(dataset_id, table, row_ids, tdr_url)
                
#--------------------------------------------------------------------------------------------------------

# Run the soft deletions with the parameters set at the top of this cell
execute_deletions(
    dataset_id_list=dataset_id_list,
    table_list=table_list,
    delete_all_records=delete_all_records,
    delete_record_list=delete_record_list,
    tdr_url=tdr_url,
)



## Compare Record Counts

In [None]:
#############################################
## Functions
#############################################

def fetch_dataset_counts(dataset_id, tdr_url="https://data.terra.bio"):
    """Collect the record count of every table in a TDR dataset.

    Args:
        dataset_id: ID of the dataset to inspect.
        tdr_url: Endpoint used to build the TDR API client. Defaults to the
            URL this function previously had hard-coded.

    Returns:
        A list of ``[dataset_id, table, record_count, status, message]`` rows,
        one per table and ordered by table name. ``status`` is "Success" or
        "Failure"; failed rows carry a count of 0 and the error text. If the
        schema itself cannot be read, a single "All" failure row is returned.
    """
    # Setup/refresh TDR clients
    api_client = utils.refresh_tdr_api_client(tdr_url)
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

    # Get table list from dataset (de-duplicated, then sorted so the output order is reproducible)
    try:
        dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA"]).to_dict()
        table_names = sorted({table_entry["name"] for table_entry in dataset_details["schema"]["tables"]})
    except Exception as e:
        error_str = f"Error retrieving details from dataset: {str(e)}"
        return [[dataset_id, "All", 0, "Failure", error_str]]

    # For each table, query the dataset and keep the total row count reported with the response
    payload = {
      "offset": 0,
      "limit": 10,
      "sort": "datarepo_row_id",
      "direction": "asc",
      "filter": ""
    }
    max_attempts = 3
    retry_wait_seconds = 10
    results = []
    for table in table_names:
        for attempt in range(1, max_attempts + 1):
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_id, table=table, query_data_request_model=payload).to_dict()
                results.append([dataset_id, table, record_results["total_row_count"], "Success", ""])
                break
            except Exception as e:
                if attempt < max_attempts:
                    sleep(retry_wait_seconds)
                else:
                    # Out of attempts: record the failure for this table and move on to the next one
                    error_str = f"Error pulling record count: {str(e)}"
                    results.append([dataset_id, table, 0, "Failure", error_str])
    return results

def fetch_workspace_counts(workspace):
    """Collect the entity count of every table in a workspace.

    The workspace is looked up in the "anvil-datastorage" project through
    ``utils.list_entity_types``.

    Args:
        workspace: Name of the workspace to inspect.

    Returns:
        A list of ``[workspace, table, count, status, message]`` rows. On any
        error an "All" failure row (count 0, error text as message) is added
        after whatever rows had already been gathered.
    """
    rows = []
    try:
        # Pull workspace table counts
        entity_response = utils.list_entity_types("anvil-datastorage", workspace, "https://api.firecloud.org")
        counts_by_table = json.loads(entity_response.text)
        for table_name, details in counts_by_table.items():
            rows.append([workspace, table_name, details["count"], "Success", ""])
    except Exception as e:
        rows.append([workspace, "All", 0, "Failure", f"Error retrieving details from workspace: {str(e)}"])
    return rows

def _fetch_object_counts(source_object):
    """Fetch count rows for one source object and return them as a DataFrame.

    A name shaped like a UUID is treated as a TDR dataset ID; anything else is
    treated as a workspace name.
    """
    uuid_pattern = re.compile(r'^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$')
    if uuid_pattern.match(source_object):
        count_rows = fetch_dataset_counts(source_object)
    else:
        count_rows = fetch_workspace_counts(source_object)
    return pd.DataFrame(count_rows, columns = ["source_object", "table", "record_count", "status", "message"])

def compare_row_counts(object_1, object_2, table_ignore_list):
    """Compare per-table record counts between two source objects.

    Args:
        object_1: TDR dataset ID (UUID) or workspace name.
        object_2: TDR dataset ID (UUID) or workspace name.
        table_ignore_list: Table names to drop from the result; may be empty.

    Returns:
        A DataFrame with one row per table found in either object and the
        columns table, source_object_1, record_count_1, source_object_2,
        record_count_2, match, match_detail, status_1, message_1, status_2,
        message_2. ``match`` is True only when both counts were retrieved
        successfully and are equal; ``match_detail`` explains every row where
        it is not.
    """
    df_obj_1_results = _fetch_object_counts(object_1)
    df_obj_2_results = _fetch_object_counts(object_2)

    # Compare and return results
    df_merged_results = pd.merge(df_obj_1_results, df_obj_2_results, on="table", how="outer", suffixes=("_1", "_2"))
    count_1 = df_merged_results["record_count_1"]
    count_2 = df_merged_results["record_count_2"]
    status_1 = df_merged_results["status_1"]
    status_2 = df_merged_results["status_2"]

    # A retrieval failed when a status is present and is not "Success". A missing status just means
    # the table was absent from that object, which is reported separately below.
    retrieval_failed = (status_1.notna() & (status_1 != "Success")) | (status_2.notna() & (status_2 != "Success"))

    # Failed retrievals report a count of 0, so equal counts only prove a match when both retrievals succeeded
    df_merged_results["match"] = (count_1 == count_2) & count_1.notna() & count_2.notna() & ~retrieval_failed

    # np.select uses the first condition that holds, so failures take precedence over the "only exists" labels
    conditions = [retrieval_failed, count_1.isna(), count_2.isna(), df_merged_results["match"]]
    choices = ["Failure in record retrieval for one or more source objects", "Table only exists in source_object_2", "Table only exists in source_object_1", None]
    df_merged_results["match_detail"] = np.select(conditions, choices, default="Table exists in both source objects but record counts don't match")
    if table_ignore_list:
        df_merged_results = df_merged_results[~df_merged_results["table"].isin(table_ignore_list)]
    df_merged_results = df_merged_results[["table", "source_object_1", "record_count_1", "source_object_2", "record_count_2", "match", "match_detail", "status_1", "message_1", "status_2", "message_2"]].where((pd.notnull(df_merged_results)), None)
    return df_merged_results
    
#############################################
## Input Parameters
#############################################

# Specify tables to ignore (rows for these tables are dropped from the comparison results)
table_ignore_list = [
    "file_inventory",
    "workspace_attributes",
    "ingestion_reference",
]

# Specify the list of dataset pairs to compare
# Each entry is a two-item list; an item shaped like a UUID is read as a TDR dataset, anything else as a workspace name
object_pairs_list = [
    #["<dataset_id/workspace_name>", "<dataset_id/workspace_name>"]
    ["AnVIL_CMH_GAFK_R5_Staging", "fa266e3c-d921-4555-a083-fb3803ae43d2"],
]

#############################################
## Execution
#############################################

# Run validation
# Gather one result frame per pair and concatenate once, instead of growing a DataFrame inside the loop
pair_results = []
for object_pair in object_pairs_list:
    df_object_pair_results = compare_row_counts(object_pair[0], object_pair[1], table_ignore_list)
    pair_results.append(df_object_pair_results)

if pair_results:
    df_agg_results = pd.concat(pair_results, ignore_index=True)
else:
    # No pairs configured: keep the expected columns so the displays below still work
    df_agg_results = pd.DataFrame(columns=["table", "source_object_1", "record_count_1", "source_object_2", "record_count_2", "match", "match_detail", "status_1", "message_1", "status_2", "message_2"])

# Display final results
print("Failures Only:")
display(df_agg_results[df_agg_results["match"].eq(False)])

print("Full Results:")
display(df_agg_results)


## TDR Dataset and/or Snapshot Deletion

In [None]:
# Function to delete a specific TDR Snapshot
def delete_snapshot(snapshot_id, tdr_url):
    """Delete one TDR snapshot and print the outcome.

    Args:
        snapshot_id: ID of the snapshot to delete.
        tdr_url: Endpoint used to build the TDR API client.

    Returns:
        None. The job result, or the error if the deletion raises, is printed.
        The module-level ``run_env`` is forwarded to ``utils.wait_for_tdr_job``.
    """
    tdr_client = utils.refresh_tdr_api_client(tdr_url)
    tdr_snapshots = data_repo_client.SnapshotsApi(api_client=tdr_client)
    print("Attempting to delete snapshot = {}".format(snapshot_id))
    try:
        deletion_job = tdr_snapshots.delete_snapshot(id=snapshot_id)
        outcome, _ = utils.wait_for_tdr_job(deletion_job, run_env)
        print("Result: {}".format(outcome))
    except Exception as err:
        # Reported, not raised, so callers deleting several snapshots keep going
        print("Error: {}".format(err))

# Function to delete a specific TDR Dataset
def delete_dataset(dataset_id, tdr_url):
    """Delete one TDR dataset and print the outcome.

    Args:
        dataset_id: ID of the dataset to delete.
        tdr_url: Endpoint used to build the TDR API client.

    Returns:
        None. The job result, or the error if the deletion raises, is printed.
        The module-level ``run_env`` is forwarded to ``utils.wait_for_tdr_job``.
    """
    tdr_client = utils.refresh_tdr_api_client(tdr_url)
    tdr_datasets = data_repo_client.DatasetsApi(api_client=tdr_client)
    print("Attempting to delete dataset = {}".format(dataset_id))
    try:
        deletion_job = tdr_datasets.delete_dataset(id=dataset_id)
        outcome, _ = utils.wait_for_tdr_job(deletion_job, run_env)
        print("Result: {}".format(outcome))
    except Exception as err:
        # Reported, not raised, so a caller looping over datasets keeps going
        print("Error: {}".format(err))

# Function to delete a specific TDR Dataset and all of its Snapshots
def delete_dataset_and_all_snapshots(dataset_id, tdr_url):
    """Delete every snapshot enumerated for a dataset, then the dataset itself.

    Args:
        dataset_id: ID of the dataset to remove.
        tdr_url: Endpoint used to build the TDR API client; also passed on to
            the snapshot and dataset deletion helpers.

    Returns:
        None. Progress and outcomes are printed by this function and by the
        deletion helpers it calls.
    """
    api_client = utils.refresh_tdr_api_client(tdr_url)
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    print("Attempting to delete dataset = {} and all associated snapshots".format(dataset_id))
    # Delete snapshots
    # Ask for a large page explicitly so the listing is not cut off at the API's default page size
    snapshot_list = snapshots_api.enumerate_snapshots(dataset_ids=[dataset_id], limit=1000)
    for snapshot in snapshot_list.items or []:
        # tdr_url has to be forwarded: delete_snapshot requires it (the call used to omit it and raised TypeError)
        delete_snapshot(str(snapshot.id), tdr_url)
        sleep(10)
    # Delete dataset
    delete_dataset(dataset_id, tdr_url)

# Alternative: delete individual snapshots only (uncomment to use)
# # Delete snapshots
# run_env = "prod"
# tdr_url = "https://data.terra.bio"
# snapshot_id_list = [
# ]
# for snapshot_id in snapshot_id_list:
#     delete_snapshot(snapshot_id, tdr_url)

# Delete datasets and all their associated snapshots
# run_env is read by the deletion helpers; tdr_url is the endpoint their API clients are built for
run_env = "prod"
tdr_url = "https://data.terra.bio"
dataset_id_list = ["500b7041-1338-48f3-b52f-82ae92c231f6"]
for dataset_id in dataset_id_list:
    delete_dataset_and_all_snapshots(dataset_id=dataset_id, tdr_url=tdr_url)

## AnVIL TDR Service Account Management

### Determine the Buckets a TDR Dataset SA Needs Access To

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
tdr_url = "https://data.terra.bio"

# Establish API clients (the same BigQuery client is reused for every dataset)
api_client = utils.refresh_tdr_api_client(tdr_url)
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
client = bigquery.Client()

# Loop through enumerated datasets and create records for those on the configured billing profile
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=5000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue
    try:
        # Retrieve dataset details to locate its BigQuery project and schema
        dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        big_query_info = (dataset_details.get("access_information") or {}).get("big_query")
        if not big_query_info:
            raise ValueError("dataset has no BigQuery access information")
        bq_project = big_query_info["project_id"]
        bq_schema = big_query_info["dataset_name"]

        # Pull the distinct gs:// buckets named in the dataset's successful load history.
        # Raw f-string: the regex backslashes reach BigQuery unchanged, without Python's invalid-escape warnings.
        source_bucket_query = rf"""SELECT DISTINCT REGEXP_EXTRACT(source_name, r'^gs:\/\/([a-z0-9\-]+)\/') AS source_bucket FROM `{bq_project}.{bq_schema}.datarepo_load_history` WHERE state = 'succeeded'"""
        df_output = client.query(source_bucket_query).result().to_dataframe()
        for source_bucket in df_output["source_bucket"]:
            records_list.append([dataset_entry.id, source_bucket, "Success"])
    except Exception as e:
        # Keep going with the remaining datasets, but show why this one failed and flag it in the report
        print(f"Error retrieving source buckets for dataset {dataset_entry.id}: {e}")
        records_list.append([dataset_entry.id, "", "Error"])

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Source Bucket", "Retrieval Status"])
df_sorted = df.sort_values(["Dataset UUID", "Source Bucket"], ascending=[True, True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

### Clean Up Outdated AnVIL TDR Service Accounts

In [None]:
# Terra API endpoint, and the service accounts that must keep their access
terra_url = "https://api.firecloud.org"
valid_sa_list = []

# Establish credentials
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
auth_headers = {"Authorization": f"Bearer {creds.token}"}

# Get current anvil_tdr_ingest membership
group = "anvil_tdr_ingest"
group_members = requests.get(url=f"{terra_url}/api/groups/{group}", headers=auth_headers).json()

# Select the group members that look like TDR ingest service accounts and are not on the keep list, then remove them
outdated_group_sas = [
    email for email in group_members["membersEmails"]
    if "tdr-ingest-sa" in email and email not in valid_sa_list
]
user_cnt = len(outdated_group_sas)
success_cnt = 0
for member in outdated_group_sas:
    response = requests.delete(url=f"{terra_url}/api/groups/{group}/member/{member}", headers=auth_headers)
    # Only a 204 response is counted as a successful removal
    if response.status_code == 204:
        success_cnt += 1
print(f"Group ({group}) clean-up: ")
print(f"\t- Users to remove: {user_cnt}")
print(f"\t- Users removed successfully: {success_cnt}")

# Get current workspace membership
# NOTE(review): ws_project and ws_name are not set in this cell -- presumably defined by an earlier cell; confirm
ws_acl_url = f"{terra_url}/api/workspaces/{ws_project}/{ws_name}/acl"
ws_members = requests.get(url=ws_acl_url, headers=auth_headers).json()

# Select the outdated service accounts on the workspace ACL and set each one to "NO ACCESS"
outdated_ws_sas = [
    email for email in ws_members["acl"].keys()
    if "tdr-ingest-sa" in email and email not in valid_sa_list
]
user_cnt = len(outdated_ws_sas)
success_cnt = 0
for member in outdated_ws_sas:
    payload = [{
        "email": member,
        "accessLevel": "NO ACCESS",
        "canShare": False,
        "canCompute": False
    }]
    response = requests.patch(url=ws_acl_url, headers=auth_headers, json=payload)
    # Only a 200 response is counted as a successful update
    if response.status_code == 200:
        success_cnt += 1
print(f"Workspace ({ws_project}/{ws_name}) clean-up: ")
print(f"\t- Users to remove: {user_cnt}")
print(f"\t- Users removed successfully: {success_cnt}")



## Clean Up Processing Directories

In [None]:
# Recursively delete (rm -r) everything under ingest_pipeline/input in this bucket.
# gsutil options: -m runs the removal in parallel; -u names the project billed for the request.
!gsutil -u anvil-datastorage -m rm -r gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/input

In [None]:
# Recursively delete (rm -r) everything under ingest_pipeline/output/source in this bucket.
# gsutil options: -m runs the removal in parallel; -u names the project billed for the request.
!gsutil -u anvil-datastorage -m rm -r gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/source

In [None]:
# Recursively delete (rm -r) everything under ingest_pipeline/output/transformed in this bucket.
# gsutil options: -m runs the removal in parallel; -u names the project billed for the request.
!gsutil -u anvil-datastorage -m rm -r gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/transformed

## Other Misc

In [None]:
# List the top level of this bucket (-u names the project billed for the request).
!gsutil -u anvil-datastorage ls gs://fc-secure-33cad843-3453-42ea-bf50-0eda2b52171d