In [None]:
# Version History
# Only the most recent entry is printed when this cell runs; earlier entries are
# kept as commented-out print statements to preserve the change log.
#print("Version 1.0.0: 09/23/2022 5:06pm - Nate Calvanese - First version created")
#print("Version 1.0.1: 09/26/2022 11:18m - Nate Calvanese - Fixed bug in default dataset naming")
#print("Version 1.0.2: 09/27/2022 2:43pm - Nate Calvanese - Added ability to aggregate multiple workspaces into one dataset")
#print("Version 1.0.3: 10/5/2022 1:32pm - Nate Calvanese - Added support for chunking up ingest requests")
#print("Version 1.0.4: 10/6/2022 10:35am - Nate Calvanese - Updated use of TDR utility functions")
#print("Version 1.0.5: 10/13/2022 10:54am - Nate Calvanese - Parameter tweaks for latest changes")
#print("Version 1.0.6: 10/21/2022 10:53am - Nate Calvanese - Version stamp for latest changes to supporting notebooks")
#print("Version 1.0.7: 10/24/2022 4:58pm - Nate Calvanese - Added support for project entity name derivation")
#print("Version 1.0.8: 10/26/2022 4:24pm - Nate Calvanese - Added support for batching mapping activities in section 3")
#print('Version 1.0.9: 2/21/2023 2:50pm - Nate Calvanese - Added support for $BQ_DATASET substitution variable in mapping section')
#print('Version 1.0.10: 3/8/2023 8:17am - Nate Calvanese - Performance improvements')
print('Version 1.0.11: 7/11/2023 8:17am - Nate Calvanese - Added auth domain back as reader on snapshots')



# Imports and Common Variables

In [None]:
# Install additional modules (one time effort per cloud environment)
# The leading "!" is the notebook shell escape: the line is run as a shell command, not as Python.
!pip install --upgrade pip import_ipynb data_repo_client urllib3 xmltodict
# Alternative kept for reference: pin data_repo_client to a specific version.
#!pip install data_repo_client==1.409.0

In [None]:
# Capture the workspace context (name, project, bucket) from the notebook's environment variables
import os
import re
print("Recording workspace environment variables:")
ws_name, ws_project, ws_bucket = (
    os.environ[env_key]
    for env_key in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# The bucket name is the bucket URI with its leading "gs://" scheme removed
ws_bucket_name = re.sub('^gs://', '', ws_bucket)
for _label, _value in (
    ("Workspace name", ws_name),
    ("Workspace project", ws_project),
    ("Workspace bucket", ws_bucket),
    ("Workspace bucket name", ws_bucket_name),
):
    print(f"{_label} = {_value}")

# Copy latest version of the pipeline notebooks to the cloud environment (uncomment if any notebooks have changed since last run)
# print("\nCopying latest pipeline notebooks to the cloud environment:")
# !gsutil -m cp $ws_bucket/notebooks/*.ipynb .

# Additional imports
print("\nRunning imports:")
import import_ipynb
import pandas as pd
from firecloud import api as fapi
import data_repo_client
import ingest_pipeline_utilities as utils
import build_mapping_query as bmq
from google.cloud import storage
from google.cloud import bigquery
import google.auth
import google.auth.transport.requests
import logging
import datetime
import json
import sys
from time import sleep
import requests

# Common pipeline variables (AnVIL)
# The workspace attributes are looked up first because the Google project is read from them.
ws_attributes = utils.get_workspace_attributes(ws_project, ws_name)
params = {
    "ws_name": ws_name,
    "ws_project": ws_project,
    "ws_bucket": ws_bucket,
    "ws_bucket_name": ws_bucket_name,
    "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",  # Dev profile
    "google_project": ws_attributes["googleProject"],
    "create_file_table": True,
    "file_table_name": "file_inventory",
    "global_file_exclusions": ["SubsetHailJointCall"],
}

# Configure logging format
# Detach every handler already on the root logger so that basicConfig below
# installs the stdout handler (basicConfig is a no-op when handlers exist).
for _handler in list(logging.root.handlers):
    logging.root.removeHandler(_handler)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
    level=logging.INFO,
    handlers=[logging.StreamHandler(sys.stdout)],
)

# Configure pandas display options (no row/column/width truncation when showing dataframes)
for _option, _setting in {
    'display.max_rows': None,
    'display.max_columns': None,
    "display.max_colwidth": None,
    'display.width': 1000,
    'display.colheader_justify': 'center',
    'display.precision': 3,
}.items():
    pd.set_option(_option, _setting)


# "EL" Pipeline: Load Dataset to TDR in Source Format

## Pipeline Run Variables

In [None]:
## >>> Run Variables <<<
# For datasets split across multiple workspaces, set the staging area and target TDR dataset to the 
# same value to collect all of the source data and process it together.
# Each entry is a 5-element list; only entries whose last element is True are processed.
# When left empty, the staging area defaults to the workspace name and the target dataset
# name defaults to utils.format_dataset_name(workspace name) (applied in the print section below).
workspace_run_list = [
    #["Workspace_Name", "Workspace_Project", "Staging Area (Leave empty for default)", "Target_TDR_Dataset_Name (Leave empty for default)", Run (True/False)]
    ["AnVIL_WS_1", "anvil-datastorage", "", "", False],
    ["AnVIL_WS_2", "anvil-datastorage", "", "", False],
]
# Stage toggles passed to the pipeline utilities via params. skip_source_files_creation is
# also checked directly in the pipeline execution cell.
params["skip_source_files_creation"] = False
params["skip_file_inventory_creation"] = False
params["skip_table_data_processing"] = False
params["skip_ingests"] = False
params["skip_snapshot_creation"] = True
params["snapshot_readers_list"] = [] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)


## >>> File Inventory Variables <<<
# The GCS bucket associated with the source workspace will be automatically included in the file inventory build. To specify 
# additional GCS buckets to include in the file inventory build, add entries to the below dictionary.
params["additional_file_inventory_sources"] = {}
# EXAMPLE:
# params["additional_file_inventory_sources"] = {
#     "staging_area": {
#         "bucket_name": {
#             "include_dirs": [], # Leave empty to include all directories in bucket
#             "exclude_dirs": [] # Exclusions will take precedence over inclusions
#         }
#     }
# }


## >>> Ingest Variables <<<
# For cases where you only want to ingest a subset of files, use the below dictionary to specify exactly what should be ingested.
params["ingest_list_override"] = {
}
# EXAMPLE:
# params["ingest_list_override"] = {
#     "ws_table": ["ws_table_0.json"], # Leave empty to run ingest for every file for target table
# }


## >>> File Reference Variables <<<
# Fields containing GCS links will be identified automatically by the pipeline. The below dict should contain any fields
# that contain file references that aren't proper GCS links in the workspace tables.
# Keys are workspace table names; each value is a list of per-column file reference definitions.
data_file_refs_dict = {   
    "sequencing": [{
        "column": "sequencing_id",
        "method": "file_path_match",
        "match_multiple_files": False, 
        "match_regex": None,
        "match_type": "partial",
        "mode": "fileref_in_line",
        "create_new_field": True,
        "new_field_name": "sequencing_id_fileref"
    }]
}
# Definitions:
#    Required Fields: column, method, mode, create_new_field
#    Optional Fields: match_multiple_files (default to True), match_regex (default to None), match_type (default to 'partial'), new_field_name (default to None)
#    Methods: 
#       file_path_match -- Field contains a full or partial file path, which can be matched to the file inventory to grab the file(s) referenced 
#       tdr_file_id -- Field contains file UUIDs of files already ingested into the target TDR dataset
#    Modes:
#       fileref_in_line -- Populates the field with a file reference object
#       fileref_table_ref -- Populates the field with an ID that joins to a file table. If no file table built, falls back on fileref_in_line logic.
    
#-----------------------------------------------------------------------------------------------------------#
    
# Print variables
# Echo the run configuration and, for each workspace flagged to run, resolve the
# workspace-derived parameters and the default staging area / dataset name.
print("Pipeline run variables set:")
print("Profile ID: " + params["profile_id"])
print("Ingests to run: ")
current_datetime = datetime.datetime.now()
current_date_string = current_datetime.strftime("%Y%m%d")
for workspace in workspace_run_list:
    if workspace[4] != True:
        continue
    ws_attributes = utils.get_workspace_attributes(workspace[1], workspace[0])
    ws_attr_values = ws_attributes["attributes"]

    # NOTE(review): these params keys are overwritten on every iteration, so after the
    # loop they hold the values of the last workspace flagged to run.
    raw_phs_id = ws_attr_values.get("phs_id")
    params["phs_id"] = utils.format_phs_id(raw_phs_id) if raw_phs_id else ""
    auth_list = ws_attributes.get("authorizationDomain") or []
    params["auth_domains"] = [auth_entry["membersGroupName"] for auth_entry in auth_list]
    params["consent_name"] = ws_attr_values.get("library:dataUseRestriction") or ""
    params["data_files_src_bucket"] = ws_attributes.get("bucketName") or ""

    # Fill in defaults: dataset name derived from the workspace name, staging area = workspace name
    workspace[3] = workspace[3] or utils.format_dataset_name(workspace[0])
    workspace[2] = workspace[2] or workspace[0]

    print("- Workspace [" + workspace[1] + "/" + workspace[0] + "] to TDR dataset [" + workspace[3] + "] via Staging Area [" + workspace[2] + "]")
    print("\t- PHS ID = " + params["phs_id"])
    print("\t- Consent Short Name = " + params["consent_name"])
    print(f"\t- Auth Domains = {params['auth_domains']}")
    print("\t- Data Files Source Bucket = " + params["data_files_src_bucket"])

# Summarize the stage toggles in the same order as they are applied
for prompt, param_key in (
    ("Skip source files creation? ", "skip_source_files_creation"),
    ("Skip file inventory creation? ", "skip_file_inventory_creation"),
    ("Skip table data processing? ", "skip_table_data_processing"),
    ("Skip ingests? ", "skip_ingests"),
    ("Ingest override list: ", "ingest_list_override"),
    ("Skip snapshot creation? ", "skip_snapshot_creation"),
):
    print(prompt + str(params[param_key]))


## Pipeline Execution

In [None]:
# Loop through and execute workspace connector pipeline ("E") for listed workspaces
if params["skip_source_files_creation"] != True:
    for workspace in workspace_run_list:
        if workspace[4] != True:
            continue
        params["data_file_refs"] = data_file_refs_dict
        utils.run_ws_connector_pipeline(workspace, params)
else:
    logging.info("Skipping source file creation, per user request.")

# Aggregate staging area to target dataset combinations, loop through them, and execute ingest pipeline ("L")
# Each distinct [staging area, target dataset] pair is run once, in first-seen order.
pipeline_run_list = []
for workspace in workspace_run_list:
    if workspace[4] != True:
        continue
    temp_list = [workspace[2], workspace[3]]
    if temp_list in pipeline_run_list:
        continue
    pipeline_run_list.append(temp_list)
for pipeline in pipeline_run_list:
    utils.run_el_pipeline(pipeline, params)


# Mapping Development
Work through the following steps for each dataset that needs to be processed through the transformation pipeline in Step 4, specifying the target schema ("mapping_target") and mapping specification ("mapping_target_spec") you would like to use for transformation. Note that you can use the logs or results_dict from the previous step to retrieve the dataset_id values of interest, or retrieve them directly from TDR via the UI or Swagger.

## Dataset Mapping Variables

In [None]:
## >>> Mapping Variables <<<
# For each dataset specified, include an appropriate mapping target and mapping target specification
datasets_to_map_list = [
    #["dataset_id", "mapping_target", "mapping_target_spec", Run (True/False)]
    ["1234", "anvil", "cmg_ext_2", False],
]

#-----------------------------------------------------------------------------------------------------------#
    
# Print variables
# Validate each dataset flagged to run: it must be retrievable from TDR and both mapping
# artifacts (target schema and mapping specification) must exist in the workspace bucket
# and parse as JSON. Datasets passing both checks are recorded in final_datasets_to_map_dict.
print("Datasets to map: ")
api_client = utils.refresh_tdr_api_client()
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
final_datasets_to_map_dict = {}
skip_dataset_list_access = []
skip_dataset_list_mapping = []
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
for dataset in datasets_to_map_list:
    if not dataset[3]:
        continue
    dataset_id = dataset[0]
    mapping_target = dataset[1]
    mapping_target_spec = dataset[2]

    # Check 1: the dataset exists and is accessible to the current user
    try:
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
    except Exception as e:
        # Catch Exception (not a bare except) so interrupts still propagate, and surface the reason
        dataset_name = ""
        skip_dataset_list_access.append(dataset_id)
        print("\t- Error retrieving dataset {} from TDR. Error: {}".format(dataset_id, e))

    # Check 2: both mapping artifacts can be downloaded and parsed (contents are not kept here)
    artifact_paths = [
        "ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target),
        "ingest_pipeline/mapping/{}/{}/mapping_specification.json".format(mapping_target, mapping_target_spec),
    ]
    try:
        for artifact_path in artifact_paths:
            blob = bucket.blob(artifact_path)
            json.loads(blob.download_as_string(client=None))
    except Exception as e:
        skip_dataset_list_mapping.append(dataset_id)
        print("\t- Error retrieving mapping artifacts for dataset {}. Error: {}".format(dataset_id, e))

    if dataset_id not in skip_dataset_list_access and dataset_id not in skip_dataset_list_mapping:
        final_datasets_to_map_dict[dataset_id] = {
            "mapping_target": mapping_target,
            "mapping_target_spec": mapping_target_spec,
        }
        print("\t- " + dataset_name + " ({})".format(dataset_id) + " with {}/{}".format(mapping_target, mapping_target_spec))
if skip_dataset_list_access:
    print("Datasets to skip due to non-existence or inaccessibility to the current user:")
    print("\t- " + "\n\t- ".join(skip_dataset_list_access))
if skip_dataset_list_mapping:
    print("Datasets to skip due to invalid mapping target or mapping target specification:")
    print("\t- " + "\n\t- ".join(skip_dataset_list_mapping))


## Add Missing Relationships to TDR Dataset Schema
Relationships are needed by the mapping query constructor to build appropriate joins between tables. If no joins are required between tables, this step is unnecessary. 

In [None]:
# Record relationships to potentially add to the source datasets. Note that there may be more relationships to add
# than those listed below, so add to this list as necessary.
# Each entry is ["from_table.from_column", "to_table.to_column"].
potential_relationships = [
    ["subject.family_id", "family.family_id"],
    ["sample.subject_id", "subject.subject_id"],
    ["sample.t_01_subject_id", "subject.subject_id"],
    ["sequencing.sample_id", "sample.sample_id"],
    ["sequencing.sample", "sample.sample_id"],
    ["sequencing.sample_alias", "sample.sample_id"],
    ["sample.participant", "participant.participant_id"],
    ["sample.participant_id", "participant.participant_id"],
    ["discovery.sample_id", "sample.sample_id"],
    ["discovery.subject_id", "subject.subject_id"],
    ["qc_result_sample.qc_result_sample_id", "sample.sample_id"],
    ["interval.chromosome", "chromosome.chromosome_id"]
]

# Loop through datasets and process potential relationship additions
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
for dataset_id in final_datasets_to_map_dict:
    print("Processing potential relationships for dataset_id = {}".format(dataset_id))
    
    # Retrieve source schema
    src_schema_dict = {}
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["tables"] = response["schema"]["tables"]
        # Treat a missing relationship list as empty so it can be iterated safely
        src_schema_dict["relationships"] = response["schema"]["relationships"] or []
    except Exception as e:
        # Without a schema nothing below can be evaluated, so move on to the next dataset
        print("Error retrieving source schema from TDR. Error: {}".format(e))
        print("Skipping relationship processing for dataset_id = {}".format(dataset_id))
        continue

    # Index existing relationships as (from_table, from_column, to_table, to_column) for fast lookup
    existing_relationships = {
        (rel_entry["_from"]["table"], rel_entry["_from"]["column"], rel_entry["to"]["table"], rel_entry["to"]["column"])
        for rel_entry in src_schema_dict["relationships"]
    }

    # Loop through potential relationships and add those present for the source dataset
    additional_relationships = []
    for rel in potential_relationships:
        from_table, from_column = rel[0].split(".")
        to_table, to_column = rel[1].split(".")
        if not (bmq.confirm_column_exists(src_schema_dict, from_table, from_column) and bmq.confirm_column_exists(src_schema_dict, to_table, to_column)):
            continue
        # A relationship already defined in either direction counts as present
        forward_key = (from_table, from_column, to_table, to_column)
        reverse_key = (to_table, to_column, from_table, from_column)
        if forward_key in existing_relationships or reverse_key in existing_relationships:
            continue
        additional_relationships.append({
            "name": from_table + "_" + from_column + "__to__" + to_table + "_" + to_column,
            "from": {"table": from_table, "column": from_column},
            "to": {"table": to_table, "column": to_column}
        })

    # Submit the schema update request for the TDR dataset
    if additional_relationships:
        schema_update_request = {
            "description": "Adding relationships to support query construction.",
            "changes": {
                "addRelationships": additional_relationships
            }
        }
        try:
            resp = utils.wait_for_tdr_job(datasets_api.update_schema(id=dataset_id, dataset_schema_update_model=schema_update_request))
            print("Schema update successful: " + str(resp)[0:1000])
        except Exception as e:
            print("Error running schema update: " + str(e))
    else:
        print("No additional relationships to add to schema.")

print("Processing of potential relationships for specified datasets complete.")


## Retrieve Mapping Artifacts and Run Query Construction
Retrieve the artifacts you would like to use to construct transformation queries for your datasets, based on the previously specified target schema and mapping specification. These transformation queries will then be dynamically constructed based on the appropriate target schema, mapping specification, and source schema. 

In [None]:
# Loop through datasets and process transformation query construction
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
for dataset_id in final_datasets_to_map_dict:
    print("Building transformation queries for dataset_id = {}".format(dataset_id))

    # Collect mapping variables
    mapping_target = final_datasets_to_map_dict[dataset_id]["mapping_target"]
    mapping_target_spec = final_datasets_to_map_dict[dataset_id]["mapping_target_spec"]
    
    # Retrieve source schema
    src_schema_dict = {}
    try:
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["name"] = response["name"]
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
        phs_id = response["phs_id"]
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))

    # Set dataset name and project name parameters to substitute into transform queries
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = re.sub("'", "", utils.derive_project_name(dataset_id, phs_id, dataset_name_value))

    # Retrieve target schema and mapping specification
    target_schema_dict = {}
    mapping_spec = {}
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(ws_bucket_name)
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
        target_schema_dict = json.loads(blob.download_as_string(client=None))
    except Exception as e:
        print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
        blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
        blob_string = blob_string.replace("$BQ_DATASET", bq_project + "." + bq_schema)
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

    # Build queries from mapping specification
    query_dict = {}
    if target_schema_dict:
        for target_table in target_schema_dict["tables"]:
            table_name = target_table["name"]
            missing_artifacts = False
            if src_schema_dict and mapping_spec:
                query_dict[table_name] = bmq.build_mapping_query(target_table, src_schema_dict, mapping_spec, bq_project, bq_schema)
            else:
                missing_artifacts = True
                query_dict[table_name] = {"query": "", "syntax_check": ""} 
        if missing_artifacts == True:
            print("Source schema dictionary and/or mapping specification missing. Unable to generate queries.")
    else:
        print("Target schema dictionary missing. Unable to generate queries.")
    
    # Evaluate queries -- Publish if no issues found, otherwise convert to dataframe and display
    failure_count = 0
    for key, val in query_dict.items():
        if val["syntax_check"] != "Passed" and val["syntax_check"] != None:
            failure_count += 1
    if failure_count == 0:
        print("No failures found in query construction, publishing to the cloud.")
        # Copy target schema file to output folder for mapping target
        source_path = "ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target)
        destination_path = "ingest_pipeline/output/transformed/{}/{}/schema/mapping_schema_object.json".format(mapping_target, dataset_id)
        !gsutil cp $ws_bucket/$source_path $ws_bucket/$destination_path 2> stdout

        # Limit query dict to valid queries, write out, and copy to output folder for mapping target
        valid_query_dict = {}
        for target, val in query_dict.items():
            if val["syntax_check"] == "Passed":
                valid_query_dict[target] = val
        final_query_dict = {
            "dataset_id": dataset_id,
            "transforms": valid_query_dict
        }
        query_dict_json = json.dumps(final_query_dict)
        query_output_file = "transform_query_set.json"
        with open(query_output_file, 'w') as outfile:
            outfile.write(query_dict_json)
        destination_path = "ingest_pipeline/output/transformed/{}/{}/queries".format(mapping_target, dataset_id)
        !gsutil cp $query_output_file $ws_bucket/$destination_path/ 2> stdout
    else:
        print("Failures found in query construction, must be resolved before publishing.")
        print("Query building results:")
        query_df = pd.DataFrame.from_dict(query_dict, orient="index")
        query_df.index.name = "target_table"
        query_df.reset_index(inplace=True)
        display(query_df)

print("Transformation query construction and processing complete.")
    

## Evaluate Vocabulary Mapping
For target attributes leveraging the "VOCAB_MAP" transformation, evaluate whether the source values have a record in the dsp-data-ingest.transform_resources.vocab_map table. If additional mappings are needed, these should be put into place before the transformation queries are executed.

In [None]:
# Set display parameter
# When True only the rows lacking a mapped value are displayed; set to False to also show the full view.
show_only_missing_maps = True

# Loop through datasets and process vocabulary mapping evaluation
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
for dataset_id in final_datasets_to_map_dict:
    print("Evaluating vocabulary mapping for dataset_id = {}".format(dataset_id))

    # Collect mapping variables
    mapping_target = final_datasets_to_map_dict[dataset_id]["mapping_target"]
    mapping_target_spec = final_datasets_to_map_dict[dataset_id]["mapping_target_spec"]
    
    # Retrieve source schema
    src_schema_dict = {}
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["name"] = response["name"]
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
        phs_id = response["phs_id"]
    except Exception as e:
        # Skip this dataset: continuing would either fail on the missing name or evaluate
        # against bq_project/bq_schema/phs_id left over from a previously processed dataset.
        print("Error retrieving source schema from TDR. Error: {}".format(e))
        print("Skipping vocabulary mapping evaluation for dataset_id = {}".format(dataset_id))
        continue

    # Set dataset name and project name parameters to substitute into transform queries
    # (single quotes are stripped from the project name, matching the query construction step)
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = re.sub("'", "", utils.derive_project_name(dataset_id, phs_id, dataset_name_value))

    # Retrieve target schema and mapping specification
    target_schema_dict = {}
    mapping_spec = {}
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(ws_bucket_name)
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
        target_schema_dict = json.loads(blob.download_as_string(client=None))
    except Exception as e:
        print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        # Apply the same placeholder substitutions as the query construction step so the
        # specification evaluated here matches the one used to build the queries.
        blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
        blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
        blob_string = blob_string.replace("$BQ_DATASET", bq_project + "." + bq_schema)
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

    # Evaluate vocab mapping and display results
    df = bmq.evaluate_vocab_mapping(mapping_spec, src_schema_dict, target_schema_dict, bq_project, bq_schema)
    print("-------------------------------------------")
    print("Missing mapped_value view:")
    print("-------------------------------------------")
    # Rows that have a source value but no mapped value
    display(df[df["mapped_value"].isnull() & df["source_value"].notnull()])
    if not show_only_missing_maps:
        print("\n-------------------------------------------")
        print("Full view:")
        print("-------------------------------------------")
        display(df)
    
print("Vocabulary mapping evaluation and processing complete.")


## [Optional] Update/Override Generated Queries as Necessary
Review any queries that have not passed the syntax check, as these need to be remedied before they can be published and executed. Any other queries that do not align with expectations can be overridden by either A) Updating the mapping target specification and re-running the previous step, or B) Manually overriding the query below. Option B should only be used in one-off cases.

### Build Base Query Dictionary

In [None]:
# Input the appropriate dataset and mapping target specification
dataset_id = "f1e1ef01-d52d-423e-a65b-3a1d26c7ee9d"
mapping_target = "anvil"
mapping_target_spec = "cmg_ext_2"

# Retrieve source schema
# src_schema_dict is left empty unless every field is retrieved, so a partial failure
# is reported as a missing source schema instead of being used half-populated.
src_schema_dict = {}
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
try:
    response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
    retrieved_schema = {
        "name": response["name"],
        "tables": response["schema"]["tables"],
        "relationships": response["schema"]["relationships"],
    }
    bq_project = response["access_information"]["big_query"]["project_id"]
    bq_schema = response["access_information"]["big_query"]["dataset_name"]
    phs_id = response["phs_id"]
    src_schema_dict = retrieved_schema
except Exception as e:
    print("Error retrieving source schema from TDR. Error: {}".format(e))

# Retrieve target schema and mapping specification
target_schema_dict = {}
mapping_spec = {}
storage_client = storage.Client()
bucket = storage_client.get_bucket(ws_bucket_name)
try:
    blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/mapping_schema_object.json")
    target_schema_dict = json.loads(blob.download_as_string(client=None))
except Exception as e:
    print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
if src_schema_dict:
    # Set dataset name and project name parameters to substitute into transform queries
    # (single quotes are stripped from the project name, matching the query construction step)
    dataset_name_value = re.sub("(_[0-9]+$)", "", src_schema_dict["name"])
    project_name_value = re.sub("'", "", utils.derive_project_name(dataset_id, phs_id, dataset_name_value))
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/{mapping_target}/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        # Apply the same placeholder substitutions as the query construction step
        blob_string = blob_string.replace("$DATASET_NAME", dataset_name_value)
        blob_string = blob_string.replace("$PROJECT_NAME", project_name_value)
        blob_string = blob_string.replace("$BQ_DATASET", bq_project + "." + bq_schema)
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

# Build queries from mapping specification
query_dict = {}
if not target_schema_dict:
    print("Target schema dictionary missing. Unable to generate queries.")
elif src_schema_dict and mapping_spec:
    for target_table in target_schema_dict["tables"]:
        query_dict[target_table["name"]] = bmq.build_mapping_query(target_table, src_schema_dict, mapping_spec, bq_project, bq_schema)
else:
    # Record an empty entry per target table so the results table still lists them
    for target_table in target_schema_dict["tables"]:
        query_dict[target_table["name"]] = {"query": "", "syntax_check": ""}
    print("Source schema dictionary and/or mapping specification missing. Unable to generate queries.")
    
# Display query dictionary
query_df = pd.DataFrame.from_dict(query_dict, orient="index")
query_df.index.name = "target_table"
query_df.reset_index(inplace=True)
display(query_df)
    


### Update Query Dict as Necessary

In [None]:
# To update the query definition for particular target table, input the target table and query below
target_table = "anvil_donor"
query = "SELECT 1"

# Store the override on the existing entry, then re-run the syntax check against it
query_entry = query_dict[target_table]
query_entry["query"] = query
query_entry["syntax_check"] = bmq.run_syntax_check(query)
print(query_entry)


### Publish Updated Query Dict

In [None]:
# Copy target schema file to output folder for mapping target
source_path = "ingest_pipeline/mapping/{}/mapping_schema_object.json".format(mapping_target)
destination_path = "ingest_pipeline/output/transformed/{}/{}/schema/mapping_schema_object.json".format(mapping_target, dataset_id)
!gsutil cp $ws_bucket/$source_path $ws_bucket/$destination_path 2> stdout

# Limit query dict to valid queries, write out, and copy to output folder for mapping target
valid_query_dict = {}
for target, val in query_dict.items():
    if val["syntax_check"] == "Passed":
        valid_query_dict[target] = val
final_query_dict = {
    "dataset_id": dataset_id,
    "transforms": valid_query_dict
}
query_dict_json = json.dumps(final_query_dict)
query_output_file = "transform_query_set.json"
with open(query_output_file, 'w') as outfile:
    outfile.write(query_dict_json)
destination_path = "ingest_pipeline/output/transformed/{}/{}/queries".format(mapping_target, dataset_id)
!gsutil cp $query_output_file $ws_bucket/$destination_path/ 2> stdout

# "T" Pipeline: Load Additional Transformed Tables to TDR

## Pipeline Run Variables

In [None]:
# Run Variables
dataset_id_run_list = [
    #["dataset_id", Run (True/False)],
    ["1234", False],
]
# Pipeline switches for this run; all of them are written onto the shared params dictionary in one call
params.update({
    "mapping_target": "anvil",
    "skip_transforms": False,
    "transform_list_override": [],  # Leave empty to run transforms for all files, otherwise populate with target table names
    "skip_schema_extension": False,
    "skip_ingests": False,
    "ingest_list_override": [],  # Leave empty to run ingests for all files, otherwise populate with target table names
    "skip_file_relation_inference": False,
    "skip_dangling_fk_resolution": False,
    "skip_supplementary_file_identification": False,
    "skip_snapshot_creation": False,
    "snapshot_readers_list": ["azul-anvil-prod@firecloud.org", "auth-domain"],  # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
    "skip_data_validation": False,
})

#-----------------------------------------------------------------------------------------------------------#

# Print variables
# Summarizes the run configuration and looks up each dataset flagged to run, so that
# missing or inaccessible datasets are reported before the pipeline is executed.
print("Pipeline run variables set:")
print("Profile ID: " + params["profile_id"])
print("Mapping Target: " + params["mapping_target"])
print("Datasets to run: ")
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
dataset_skip_list = []
for dataset in dataset_id_run_list:
    # Each entry is [dataset_id, run_flag]; only flagged datasets are looked up
    if not dataset[1]:
        continue
    dataset_id = dataset[0]
    try:
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        phs_id = dataset_info["phs_id"]
        consent_name = dataset_info["properties"]["consent_name"]
        auth_domains = dataset_info["properties"]["auth_domains"]
        src_workspaces = dataset_info["properties"]["source_workspaces"]
    except Exception:
        # Any lookup failure (API error or missing property) marks the dataset as skipped.
        # Exception is caught instead of a bare except so a kernel interrupt still stops the cell.
        dataset_name = ""
        dataset_skip_list.append(dataset_id)
    if dataset_name:
        # f-strings are used so that a None value (e.g. no PHS ID) prints instead of raising TypeError
        print(f"- {dataset_name} ({dataset_id})")
        print(f"\t- PHS ID = {phs_id}")
        print(f"\t- Consent Short Name = {consent_name}")
        print(f"\t- Auth Domains = {auth_domains}")
        print(f"\t- Source Workspaces = {src_workspaces}")
if dataset_skip_list:
    print("Datasets to skip (they either don't exist or aren't accessible to the current user): ")
    print("\t- " + "\n\t- ".join(str(skipped_id) for skipped_id in dataset_skip_list))
print("Skip transforms? " + str(params["skip_transforms"]))
print("Skip schema extension? " + str(params["skip_schema_extension"]))
print("Skip ingests? " + str(params["skip_ingests"]))
print("Skip file relationship inference? " + str(params["skip_file_relation_inference"]))
print("Skip dangling foreign key resolution? " + str(params["skip_dangling_fk_resolution"]))
print("Skip supplementary file identification? " + str(params["skip_supplementary_file_identification"]))
print("Skip snapshot creation? " + str(params["skip_snapshot_creation"]))
print("Skip data validation? " + str(params["skip_data_validation"]))


## Pipeline Execution

In [None]:
# Loop through and execute the pipeline for each dataset flagged to run
for dataset in dataset_id_run_list:
    # Each entry is [dataset_id, run_flag]
    if not dataset[1]:
        continue
    dataset_id = dataset[0]
    try:
        # The client is refreshed per dataset before retrieving its details
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        phs_id = dataset_info["phs_id"]
        consent_name = dataset_info["properties"]["consent_name"]
        auth_domains = dataset_info["properties"]["auth_domains"]
        src_workspaces = dataset_info["properties"]["source_workspaces"]
    except Exception as e:
        # Report the skipped dataset rather than dropping it silently. Exception is caught
        # instead of a bare except so a kernel interrupt still stops the run.
        print(f"Skipping dataset {dataset_id}: unable to retrieve dataset details ({e})")
        dataset_name = ""
    if dataset_name:
        params["dataset_id"] = dataset_id
        params["dataset_name"] = dataset_name
        params["phs_id"] = phs_id
        params["consent_name"] = consent_name
        params["auth_domains"] = auth_domains
        utils.run_t_pipeline(params)
        

# Utility Scripts
Uncomment sections as necessary to accomplish various miscellaneous tasks.

## Cloud Storage Clean-up

In [None]:
# Set the name, mapping target, and target dataset_id of the source workspace whose table data files should be removed 
clean_staging_area_data = True
staging_area_name = ""
clean_transformed_data = True
mapping_target = "anvil"
dataset_id = "1234"

#--------------------------------------------------------------------------------------------------------

# gsutil commands to remove dataset table data and data file inventories where they live
if clean_staging_area_data:
    !gsutil -m rm -r $ws_bucket/ingest_pipeline/input/$workspace_name
    !gsutil -m rm -r $ws_bucket/ingest_pipeline/output/source/$workspace_name/table_data
if clean_transformed_data:
    !gsutil -m rm -r $ws_bucket/ingest_pipeline/output/transformed/$mapping_target/$dataset_id/table_data


## Soft Deletion of TDR Dataset Records

In [None]:
# Input parameters: the datasets and tables to soft delete records from
dataset_id_list = ["8b2b1c92-66cf-403c-8eb0-03b523d1550e"]
table_list = [
    "anvil_activity",
    "anvil_alignmentactivity",
    "anvil_antibody",
    "anvil_assayactivity",
    "anvil_biosample",
    "anvil_dataset",
    "anvil_diagnosis",
    "anvil_donor",
    "anvil_file",
    "anvil_project",
    "anvil_sequencingactivity",
    "anvil_variantcallingactivity",
]
# Alternative table lists (uncomment one to use it instead):
#table_list = ["file_inventory", "sample", "subject", "workspace_attributes", "sequencing", "qc_result_sample", "family", "chromosome", "interval", "participant", "discovery", "sample_set", "vcf"]
#table_list = ['sequencing']
delete_all_records = True
delete_record_list = []  # Will be ignored if delete_all_records is set to True

#--------------------------------------------------------------------------------------------------------

# Refresh the TDR API client and build the datasets client used by the functions below
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Function to soft delete specific rows from a table in a TDR dataset
def delete_datarepo_rows(dataset_id, table_name, datarepo_row_ids):
    """Submit a soft-delete request for the given row IDs and print the outcome.

    Args:
        dataset_id: ID of the TDR dataset holding the table.
        table_name: Name of the table to delete rows from.
        datarepo_row_ids: Row IDs to delete; nothing is submitted when empty.

    Returns:
        None. The job result, or any error raised while submitting or waiting
        on the job, is printed rather than returned or raised.
    """
    print(f"Attempting to delete specified rows from {table_name} for dataset {dataset_id}")
    if not datarepo_row_ids:
        print("No datarepo_row_ids specified for deletion.")
        return
    row_spec = {"rowIds": datarepo_row_ids}
    table_spec = {"tableName": table_name, "jsonArraySpec": row_spec}
    deletion_request = {
        "deleteType": "soft",
        "specType": "jsonArray",
        "tables": [table_spec],
    }
    try:
        deletion_job = datasets_api.apply_dataset_data_deletion(id=dataset_id, data_deletion_request=deletion_request)
        deletion_result, _ = utils.wait_for_tdr_job(deletion_job)
        print(f"Result: {deletion_result}")
    except Exception as err:
        print(f"Error: {err}")

# Function to collect all datarepo rows for a particular table within a dataset
def collect_all_datarepo_rows(dataset_id, table_name):
    """Return the datarepo_row_id of every row in a dataset table.

    The dataset's BigQuery project and dataset name are read from its TDR
    access information, then the table is queried for its datarepo_row_id
    column.

    Args:
        dataset_id: ID of the TDR dataset to inspect.
        table_name: Name of the table whose row IDs should be collected.

    Returns:
        A list of datarepo_row_id values. An empty list is returned (and the
        error printed) when the dataset lookup or the query fails.
    """
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        print("Error retrieving BQ project and schema: {}".format(str(e)))
        # Without the BigQuery coordinates the query cannot be built, so stop here
        # instead of failing later on unbound variables.
        return []
    client = bigquery.Client()
    query = "SELECT datarepo_row_id FROM `{project}.{schema}.{table}`".format(project = bq_project, schema = bq_schema, table = table_name)
    try:
        query_job = client.query(query)
        return [row["datarepo_row_id"] for row in query_job]
    except Exception as e:
        print("Error retrieving datarepo_row_id list: {}".format(str(e)))
        # Explicit empty result keeps the return type consistent for callers
        return []

# Process each dataset/table pair: resolve the rows to delete, then submit the deletion
for dataset_id in dataset_id_list:
    print(f"Processing record deletions for dataset {dataset_id}")
    for table in table_list:
        print(f"Processing record deletion for {table}")
        # Either every row currently in the table, or the explicit list supplied above
        datarepo_row_ids = collect_all_datarepo_rows(dataset_id, table) if delete_all_records else delete_record_list
        if not datarepo_row_ids:
            print("No records specified for deletion.")
            continue
        delete_datarepo_rows(dataset_id, table, datarepo_row_ids)


## Collect AnVIL Snapshots and Datasets

In [None]:
import requests

# Collect Anvil datasets and snapshots
# Builds one record per snapshot (or one record per dataset that has no snapshots) and
# displays the result sorted by dataset name and snapshot name.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
datasets_list = datasets_api.enumerate_datasets(filter="anvil", limit=2000)
# Compiled once outside the loop; applied to the upper-cased dataset name
anvil_name_pattern = re.compile(r"^ANVIL_[a-zA-Z0-9-_]+_[0-9]{8}")
records_list = []
for dataset_entry in datasets_list.items:
    if not anvil_name_pattern.match(dataset_entry.name.upper()):
        continue
    dataset_detail = datasets_api.retrieve_dataset(id=dataset_entry.id)
    snapshots_list = snapshots_api.enumerate_snapshots(dataset_ids=[dataset_entry.id], limit=1000)
    if len(snapshots_list.items) == 0:
        # Dataset without snapshots: snapshot columns are left empty
        record = [None, None, None, None, None, None, None, dataset_entry.id, dataset_entry.name, dataset_detail.ingest_service_account, dataset_entry.created_date[0:10]]
        records_list.append(record)
    else:
        for snapshot_entry in snapshots_list.items:
            public_flag = "N"
            public_response = requests.get(
                url=f"https://sam.dsde-prod.broadinstitute.org/api/resources/v2/datasnapshot/{snapshot_entry.id}/policies/reader/public",
                headers={"Authorization": f"Bearer {creds.token}"},
            )
            if public_response.text == "true":
                public_flag = "Y"
            snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_entry.id)
            # Reset per snapshot so a snapshot without a "reader" policy does not reuse the
            # previous snapshot's readers (or hit an unbound name on the first one)
            readers = ""
            for role in snapshot_policy_response.policies:
                if role.name == "reader":
                    readers = ", ".join(role.members or [])
            record = [snapshot_entry.id, snapshot_entry.name, snapshot_entry.data_project, snapshot_entry.created_date[0:10], snapshot_entry.created_date, public_flag, readers, dataset_entry.id, dataset_entry.name, dataset_detail.ingest_service_account, dataset_entry.created_date[0:10]]
            records_list.append(record)
df = pd.DataFrame(records_list, columns =["Snapshot ID", "Snapshot Name", "Snapshot Google Project", "Snapshot Created Date", "Snapshot Created Datetime", "Snapshot Public", "Snapshot Readers", "Source Dataset ID", "Source Dataset Name", "Source Dataset SA", "Source Dataset Created Date"])
df_sorted = df.sort_values(["Source Dataset Name", "Snapshot Name"], ascending=[True, True], ignore_index=True)
display(df_sorted)


## TDR Dataset and/or Snapshot Deletion

In [None]:
# Function to delete a single TDR snapshot by ID
def delete_snapshot(snapshot_id):
    """Delete the given TDR snapshot and print the job result.

    Args:
        snapshot_id: ID of the snapshot to delete.

    Returns:
        None. Errors raised while submitting or waiting on the deletion job
        are printed rather than raised.
    """
    snapshots_api = data_repo_client.SnapshotsApi(api_client=utils.refresh_tdr_api_client())
    print(f"Attempting to delete snapshot = {snapshot_id}")
    try:
        deletion_job = snapshots_api.delete_snapshot(id=snapshot_id)
        deletion_result, _ = utils.wait_for_tdr_job(deletion_job)
        print(f"Result: {deletion_result}")
    except Exception as err:
        print(f"Error: {err}")

# Function to delete a single TDR dataset by ID
def delete_dataset(dataset_id):
    """Delete the given TDR dataset and print the job result.

    Args:
        dataset_id: ID of the dataset to delete.

    Returns:
        None. Errors raised while submitting or waiting on the deletion job
        are printed rather than raised.
    """
    datasets_api = data_repo_client.DatasetsApi(api_client=utils.refresh_tdr_api_client())
    print(f"Attempting to delete dataset = {dataset_id}")
    try:
        deletion_job = datasets_api.delete_dataset(id=dataset_id)
        deletion_result, _ = utils.wait_for_tdr_job(deletion_job)
        print(f"Result: {deletion_result}")
    except Exception as err:
        print(f"Error: {err}")

# Function to delete a specific TDR Dataset and all of its Snapshots
def delete_dataset_and_all_snapshots(dataset_id):
    """Delete every snapshot built from a dataset, then delete the dataset itself.

    Args:
        dataset_id: ID of the TDR dataset to remove along with its snapshots.

    Returns:
        None. Individual deletions print their own results and errors.
    """
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    print("Attempting to delete dataset = {} and all associated snapshots".format(dataset_id))
    dataset_id_list = [dataset_id]
    # Delete snapshots. An explicit limit is passed (matching the snapshot enumeration used
    # elsewhere in this notebook) so the listing is not cut off at the API's default page size,
    # which would leave snapshots behind.
    snapshot_list = snapshots_api.enumerate_snapshots(dataset_ids=dataset_id_list, limit=1000)
    for snapshot in snapshot_list.items or []:
        snapshot_id = str(snapshot.id)
        delete_snapshot(snapshot_id)
        # Pause between snapshot deletions
        sleep(10)
    # Delete dataset
    delete_dataset(dataset_id)

# # Delete snapshots
# snapshot_id_list = [
# '3a827928-5c1d-4a60-8af2-abf3dd41bc5a',
# ]
# for snapshot_id in snapshot_id_list:
#     delete_snapshot(snapshot_id)

# Delete datasets and all their associated snapshots
# NOTE: running this cell calls delete_dataset_and_all_snapshots for every dataset ID listed
# below (each dataset's snapshots are deleted first, then the dataset itself), so review the
# list before executing.
dataset_id_list = [
'2f86bdd6-ba57-41d0-b22b-e951ac960fb3',
'2f47988f-d0fa-4019-8ea8-6e05fad0ef30',
'913b4373-6a9f-4a01-88d2-ab5290a055fe',
'1b3290fb-4be1-4558-9d66-92746f0f38d4',
'656c5884-b164-405e-a365-5a73bca8b000',
'95803570-3920-4ebc-8fa7-cf7aec78e4ed',
'6d26d395-c062-4f2a-ac59-1c8c26b5b911',
'e4840761-3d00-44f7-821d-7f2c512860f9',
'175cc5db-36b1-4de2-851b-85b9645cf685',
'a163a60c-fce0-4ac7-927e-929add996882',
'eee18026-0c04-4be4-9ab3-d707419face9',
'9ab57a66-950f-443b-813d-f91fb01b892d',
'576982f4-1581-4a81-88df-02c80ace0357',
'3bb7acd9-fef6-45ba-8441-b379b15c71ab',
'21844b71-fe26-4ccf-a2a4-56e1029b4bbb',
'a71a0f4b-c0f8-4256-ac46-03b38ecdf778',
'e29f9779-4014-45f9-9ae2-e71450711584',
'b6820544-7b20-4eca-b582-43bae83cceb8',
'63954db0-bc05-4298-bc07-c270c8eb03cc',
'c8edcc0e-76c0-45d6-bfb2-164054db59cc',
'e5d3e605-67a5-4317-b535-f75432700279',
'652cafc5-0131-48ea-b44d-3c59f1003eb2',
'357cfe22-6c6d-4377-a6fc-9feaef9735e0',
'8eff5518-fffb-4dce-bcf8-7f016300ee59',
'4e8b71f5-c5b4-4b88-baf2-edecc1535e45',
'027964c6-a26c-4eb1-9816-e02c5735cfe1',
'c70a9b17-dd6c-4154-989b-57bcbaa24f6d',
'2af7c1e1-7670-4c00-890f-524700ffcb80',
'6b6e2c3b-08da-48a1-bf07-9345d2a08802',
'b197a920-dc04-41f7-b35f-77d4dcbd5b19',
'329572b2-8826-419a-a948-66a2d7c0aa38',
'933c4de4-d48a-40c2-8fd6-c0b3e791a548',
'6892958e-4f7e-404b-8755-dbbfdbf04f26',
]
# Datasets are processed one at a time, in list order
for dataset_id in dataset_id_list:
    delete_dataset_and_all_snapshots(dataset_id)

## Clean Up Outdated AnVIL TDR Service Accounts

In [None]:
# Service accounts to keep; any other member containing "tdr-ingest-sa" is treated as outdated
valid_sa_list = [
]

# Establish credentials
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

# Get current anvil_tdr_ingest membership
group = "anvil_tdr_ingest"
group_url = f"https://api.firecloud.org/api/groups/{group}"
group_members = requests.get(
    url=group_url,
    headers={"Authorization": f"Bearer {creds.token}"}
).json()

# Work out which group members are outdated ingest service accounts, then remove each one
outdated_group_members = [
    member
    for member in group_members["membersEmails"]
    if "tdr-ingest-sa" in member and member not in valid_sa_list
]
user_cnt = len(outdated_group_members)
success_cnt = 0
for member in outdated_group_members:
    response = requests.delete(
        url=f"{group_url}/member/{member}",
        headers={"Authorization": f"Bearer {creds.token}"}
    )
    # A 204 response is counted as a successful removal
    success_cnt += 1 if response.status_code == 204 else 0
print(f"Group ({group}) clean-up: ")
print(f"\t- Users to remove: {user_cnt}")
print(f"\t- Users removed successfully: {success_cnt}")

# Get current workspace membership
acl_url = f"https://api.firecloud.org/api/workspaces/{ws_project}/{ws_name}/acl"
ws_members = requests.get(
    url=acl_url,
    headers={"Authorization": f"Bearer {creds.token}"}
).json()

# Work out which workspace members are outdated ingest service accounts, then revoke each one's access
outdated_ws_members = [
    member
    for member in ws_members["acl"]
    if "tdr-ingest-sa" in member and member not in valid_sa_list
]
user_cnt = len(outdated_ws_members)
success_cnt = 0
for member in outdated_ws_members:
    # Access is revoked by patching the member's ACL entry down to "NO ACCESS"
    payload = [{
        "email": member,
        "accessLevel": "NO ACCESS",
        "canShare": False,
        "canCompute": False
    }]
    response = requests.patch(
        url=acl_url,
        headers={"Authorization": f"Bearer {creds.token}"},
        json=payload
    )
    # A 200 response is counted as a successful removal
    success_cnt += 1 if response.status_code == 200 else 0
print(f"Workspace ({ws_project}/{ws_name}) clean-up: ")
print(f"\t- Users to remove: {user_cnt}")
print(f"\t- Users removed successfully: {success_cnt}")

