# Imports

In [None]:
# Install additional modules (one time effort per cloud environment)
!pip install --upgrade import_ipynb data_repo_client urllib3 xmltodict

In [1]:
## imports and environment variables

# Imports
import import_ipynb
import pandas as pd
import os
import re
import json
import data_repo_client
from google.cloud import bigquery
import ingest_pipeline_utilities as utils
import build_file_inventory as bfi
import logging
from time import sleep
import datetime
from google.cloud import storage
import math

# Logging: timestamped, level-tagged messages at INFO and above
logging.basicConfig(
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
    level=logging.INFO,
)

# pandas display settings, applied in order
_display_options = (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ("display.max_colwidth", None),
    ('display.width', 1000),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
)
for _option_name, _option_value in _display_options:
    pd.set_option(_option_name, _option_value)

# Workspace settings read from the environment (a missing variable raises KeyError)
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name without the leading gs:// scheme
ws_bucket_name = re.sub('^gs://', '', ws_bucket)


importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.26: 3/24/2023 8:31am - Nate Calvanese - Turned on the global file IDs snapshot creation parameter
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
importing Jupyter notebook from build_file_inventory.ipynb
Version 2.0.1: 3/23/2022 8:29pm - Nate Calvanese - Added support for a global file exclusion
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.9: 3/8/2023 12:09pm - Nate Calvanese - Performance improvements for file ref lookups
importing Jupyter notebook from build_mapping_query.ipynb
Version 1.0.12: 2/21/2023 2:50pm - Nate Calvanese - Added support for $BQ_DATASET substitution variable
importing Jupyter notebook from output_data_validation.ipynb
Version 2.0.6: 2/28/2023 11:33am -- Updated notebook to be usable in dev (removed TDR host hardcoding)
importing Jupyter notebook from resolve_danglin

# Create new snapshot

## Script to create new full view snapshot

In [None]:
# Parameters
params = {}
params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org"]
params["anvil_schema_version"] = "ANV5"

# Loop through datasets and create new snapshot
dataset_id_run_list = [
    'ee4a96ab-ffe0-4fd6-bb69-2921f6e944d0',
]
results = []
for dataset in dataset_id_run_list:
    dataset_id = dataset

    # Retrieve the dataset details that feed the snapshot request
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        phs_id = dataset_info["phs_id"]
        consent_name = dataset_info["properties"]["consent_name"]
        auth_domains = dataset_info["properties"]["auth_domains"]
        # Not used below; the lookup is kept so a dataset lacking this property is still skipped.
        src_workspaces = dataset_info["properties"]["source_workspaces"]
    except Exception as e:
        # Catch Exception (not a bare except) and surface the reason instead of hiding it
        logging.error("Error retrieving information for dataset {}: {}".format(dataset_id, str(e)))
        dataset_name = ""

    # Record skipped datasets in the summary rather than dropping them silently
    if not dataset_name:
        logging.error("Skipping snapshot creation for dataset {}: dataset details unavailable.".format(dataset_id))
        results.append([dataset_id, "Error", ""])
        continue

    params["ws_bucket"] = ws_bucket
    params["dataset_id"] = dataset_id
    params["dataset_name"] = dataset_name
    params["phs_id"] = phs_id
    params["consent_name"] = consent_name
    params["auth_domains"] = auth_domains
    params["pipeline_results"] = []
    current_datetime = datetime.datetime.now()
    current_datetime_string = current_datetime.strftime("%Y%m%d%H%M")
    params["snapshot_name"] = params["dataset_name"] + "_" + params["anvil_schema_version"] + "_" + current_datetime_string
    utils.create_and_share_snapshot(params)

    # Summarize the pipeline results that the utility call appended to params
    int_df_results = pd.DataFrame(params["pipeline_results"], columns = ["Dataset", "Time", "Step", "Task", "Status", "Message"])
    errors = int_df_results[int_df_results["Status"].str.contains("Error")]
    if len(errors) > 0:
        results.append([dataset_id, "Error", ""])
    else:
        # Pull the snapshot UUID out of the "Create and Share Snapshot" message text
        snapshot_messages = int_df_results[int_df_results["Task"]=="Create and Share Snapshot"]["Message"]
        snapshot_match = re.search("{'id': '([a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'", str(snapshot_messages))
        if snapshot_match:
            snapshot_id = snapshot_match[1]
        else:
            # No id in the message: keep the run going and leave the id blank
            logging.warning("No snapshot ID found in pipeline results for dataset {}.".format(dataset_id))
            snapshot_id = ""
        results.append([dataset_id, "Success", snapshot_id])
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status", "snapshot_id"])
display(results_df)


# Add and populate anvil_file.is_supplementary

## Script to patch dataset

In [None]:
# Add the boolean column anvil_file.is_supplementary to a TDR dataset (when it is not
# already in the schema), recompute every anvil_file record with an is_supplementary
# value, and ingest the result back into the dataset with the "replace" update strategy.
#   dataset_id: ID of the TDR dataset to patch.
#   Returns "Success" when every step completes, otherwise "Failure".
def add_and_populate_supp_file_flg(dataset_id):
    logging.info(f"Processing anvil_file.is_supplementary for Dataset ID = {dataset_id}")
    
    # Retrieve dataset information
    logging.info("Retrieving necessary information from TDR.")
    src_schema_dict = {}
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["tables"] = response["schema"]["tables"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
        return "Failure"
    
    # Determine if field needs to be added, and add if so
    logging.info("Adding anvil_file.is_supplementary to dataset schema, if necessary.")
    field_found = False
    for table in src_schema_dict["tables"]:
        if table["name"] == "anvil_file":
            for col in table["columns"]:
                if col["name"] == "is_supplementary":
                    field_found = True
                    logging.info("Field already found! Skipping schema update.")
                    break
            # Stop scanning tables once anvil_file has been inspected
            break
    if field_found == False:
        logging.info("Field not found. Running dataset schema update.")
        schema_update_request = {
            "description": "Adding is_supplementary column to anvil_file",
            "changes": {
                "addColumns": [
                  {
                    "tableName": "anvil_file",
                    "columns": [
                      {
                        "name": "is_supplementary",
                        "datatype": "boolean",
                        "array_of": False,
                        "required": False
                      }
                    ]
                  }
                ]
            }
        }
        # Up to two attempts in total (one retry after a 15 second pause)
        attempt_counter = 0
        while True:
            try:
                schema_update_result, job_id = utils.wait_for_tdr_job(datasets_api.update_schema(id=dataset_id, dataset_schema_update_model=schema_update_request))
                logging.info("Dataset schema update succeeded!")
                break
            except Exception as e:
                logging.error("Error on dataset schema update: {}".format(str(e)))
                attempt_counter += 1
                if attempt_counter < 2:
                    logging.info("Retrying dataset schema update (attempt #{})...".format(str(attempt_counter)))
                    sleep(15)
                    continue
                else:
                    logging.error("Maximum number of retries exceeded. Unable to update dataset schema. Exiting function.")
                    return "Failure"
        
    # Re-process anvil_file data to include is_supplementary (where appropriate) and ingest into TDR dataset (as replace)
    logging.info("Re-processing existing anvil_file data to include is_supplementary value.")
    client = bigquery.Client()
    target_file = "anvil_file.json"
    destination_dir = "ingest_pipeline/output/transformed/anvil/{}/table_data".format(dataset_id)
    # The BigQuery script below runs three statements:
    #   1. activity_exp: unions the five activity tables and unnests their array columns into
    #      one row per (generated file_id, used int_file_id, used biosample_id) combination.
    #   2. act_exp_lookup: maps a file_id to a biosample_id, either directly (level 1) or by
    #      following one to four used-file -> generated-file hops (levels 2-5).
    #   3. Final SELECT: returns every anvil_file row, with is_supplementary = TRUE when the
    #      file has no biosample in act_exp_lookup and FALSE otherwise.
    query = """BEGIN
        
        CREATE TEMPORARY TABLE activity_exp AS WITH activity_agg
        AS
        (
          SELECT used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_activity`
          UNION ALL 
          SELECT [] AS used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_alignmentactivity`
          UNION ALL 
          SELECT used_biosample_id, generated_file_id, [] AS used_file_id FROM `{project}.{dataset}.anvil_assayactivity`
          UNION ALL 
          SELECT used_biosample_id, generated_file_id, [] AS used_file_id FROM `{project}.{dataset}.anvil_sequencingactivity`
          UNION ALL 
          SELECT [] AS used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_variantcallingactivity`
        )
        SELECT file_id, int_file_id, biosample_id
        FROM activity_agg
            LEFT JOIN UNNEST(used_biosample_id) AS biosample_id
            LEFT JOIN UNNEST(generated_file_id) as file_id
            LEFT JOIN UNNEST(used_file_id) as int_file_id
        ;
        
        CREATE TEMPORARY TABLE act_exp_lookup
        AS
        (
            SELECT file_id, MAX(biosample_id) AS biosample_id
          FROM
          (
            --Level 1:
            SELECT file_id, biosample_id
            FROM activity_exp
            WHERE int_file_id IS NULL AND file_id IS NOT NULl AND biosample_id IS NOT NULL
            --Level 2:
            UNION ALL
            SELECT a2.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
            WHERE a2.int_file_id IS NOT NULL AND a2.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
            --Level 3:
            UNION ALL
            SELECT a3.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
              LEFT JOIN activity_exp a3
              ON a2.file_id = a3.int_file_id
            WHERE a3.int_file_id IS NOT NULL AND a3.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
            --Level 4:
            UNION ALL
            SELECT a4.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
              LEFT JOIN activity_exp a3
              ON a2.file_id = a3.int_file_id
              LEFT JOIN activity_exp a4
              ON a3.file_id = a4.int_file_id
            WHERE a4.int_file_id IS NOT NULL AND a4.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
            --Level 5:
            UNION ALL
            SELECT a5.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
              LEFT JOIN activity_exp a3
              ON a2.file_id = a3.int_file_id
              LEFT JOIN activity_exp a4
              ON a3.file_id = a4.int_file_id
              LEFT JOIN activity_exp a5
              ON a4.file_id = a5.int_file_id
            WHERE a5.int_file_id IS NOT NULL AND a5.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
          )
          GROUP BY file_id
        );
        
        SELECT t1.file_id, data_modality, file_format, file_size, file_md5sum, reference_assembly, file_name, file_ref, source_datarepo_row_ids,
        CASE WHEN t2.biosample_id IS NULL THEN TRUE ELSE FALSE END AS is_supplementary
        FROM `{project}.{dataset}.anvil_file` t1
          LEFT JOIN act_exp_lookup t2
          ON t1.file_id = t2.file_id
        ;
        
        END
        """.format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        records_json = df.to_json(orient='records') 
        records_list = json.loads(records_json)
        records_cnt = len(records_list)
        # Write one JSON object per line, with no newline after the last record
        with open(target_file, 'w') as outfile:
            for idx, val in enumerate(records_list):
                json.dump(val, outfile)
                if idx < (records_cnt - 1):
                    outfile.write('\n')
        # Copy the file into the workspace bucket; stderr is redirected to a local file named "stdout"
        # NOTE(review): a failed gsutil copy does not appear to raise here, so it would not reach the except below - confirm
        !gsutil cp $target_file $ws_bucket/$destination_dir/ 2> stdout
        # Remove the local copy once it has been uploaded
        !rm $target_file
        logging.info("Successfully created new anvil_file.json file.")
    except Exception as e:
        logging.error("Error creating new anvil_file.json file. Exiting function. Error: {}".format(str(e)))
        return "Failure"

    # Ingest updated anvil_file data
    logging.info("Ingesting updated anvil_file data into TDR dataset.")
    source_full_file_path = "{}/{}/{}".format(ws_bucket, destination_dir, "anvil_file.json")
    ingest_request = {
        "table": "anvil_file",
        "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "json",
        "load_tag": "Ingest for {}".format(dataset_id),
        "path": source_full_file_path
    }
    # Up to two attempts in total (one retry after a 10 second pause); the TDR client is refreshed on each attempt
    attempt_counter = 0
    while True:
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest from file anvil_file.json succeeded: {}".format(str(ingest_request_result)[0:1000]))
            break
        except Exception as e:
            logging.error("Error on Dataset Ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                logging.info("Retrying Dataset Ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                continue
            else:
                logging.error("Maximum number of retries exceeded. Exiting function.")
                return "Failure"
    
    # Return success message if no failures recorded
    logging.info("Function completed successfully.")
    return "Success"

# Loop through datasets and process is_supplementary field
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
results = []
for dataset_id in dataset_id_list:
    status = add_and_populate_supp_file_flg(dataset_id)
    results.append([dataset_id, status])
# Build the summary once, after the loop, so it is defined (and not stale) even when the list is empty
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status"])
display(results_df)
    

## Script to validate patch worked properly

In [None]:
def validate_supp_file_flg(dataset_id):
    """Check that anvil_file.is_supplementary exists and is populated on every row.

    Args:
        dataset_id: ID of the TDR dataset to validate.

    Returns:
        "Success" when the column exists and its non-null count equals the row count
        of anvil_file; otherwise a string starting with "Failure - " that names the
        step that failed.
    """
    # Retrieve dataset information
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        schema_tables = response["schema"]["tables"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception:
        return "Failure - Issue Retrieving Dataset Info"

    # Determine if field exists for dataset, continue if so, fail otherwise
    field_found = False
    for table in schema_tables:
        if table["name"] == "anvil_file":
            field_found = any(col["name"] == "is_supplementary" for col in table["columns"])
            break
    if not field_found:
        return "Failure - is_supplementary field not found"

    # Compare total rows against rows with a non-null is_supplementary value
    client = bigquery.Client()
    query = """SELECT COUNT(*) AS rec_cnt, COUNT(is_supplementary) AS populated_cnt
                    FROM `{project}.{dataset}.anvil_file`""".format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        rec_cnt = df["rec_cnt"].values[0]
        populated_cnt = df["populated_cnt"].values[0]
    except Exception:
        return "Failure - BigQuery Error"
    if rec_cnt == populated_cnt:
        return "Success"
    # Previously this case fell through and returned None
    return "Failure - is_supplementary not populated for all records"

# Loop through datasets and validate is_supplementary field
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
results = []
for dataset_id in dataset_id_list:
    status = validate_supp_file_flg(dataset_id)
    results.append([dataset_id, status])
# Build the summary once, after the loop, so it is defined (and not stale) even when the list is empty
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Attempt to populate anvil_donor.organism_type

## Script to patch dataset

In [None]:
def populate_organism_type(dataset_id):
    logging.info(f"Processing anvil_donor.organism_type for Dataset ID = {dataset_id}")
    
    # Retrieve dataset information
    logging.info("Retrieving necessary information from TDR.")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
        return "Failure"

    # Re-process anvil_donor data to include organism_type (where available)
    logging.info("Re-processing existing anvil_donor data to include organism_type value.")
    client = bigquery.Client()
    target_file = "anvil_donor.json"
    destination_dir = "ingest_pipeline/output/transformed/anvil/{}/table_data".format(dataset_id)
    query = """SELECT donor_id, 
    (SELECT MAX(CASE WHEN REGEXP_CONTAINS(value, '(h37|h38|h39|hg16|hg17|hg18|hg19|hs37|hs38|b37)') THEN 'Homo sapiens' END) AS organism_type FROM `{project}.{dataset}.workspace_attributes` WHERE attribute = 'library:reference') AS organism_type,
    part_of_dataset_id, phenotypic_sex, reported_ethnicity, genetic_ancestry, source_datarepo_row_ids
    FROM `{project}.{dataset}.anvil_donor`""".format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        records_json = df.to_json(orient='records') 
        records_list = json.loads(records_json)
        records_cnt = len(records_list)
        with open(target_file, 'w') as outfile:
            for idx, val in enumerate(records_list):
                json.dump(val, outfile)
                if idx < (records_cnt - 1):
                    outfile.write('\n')
        !gsutil cp $target_file $ws_bucket/$destination_dir/ 2> stdout
        !rm $target_file
        logging.info("Successfully created new anvil_donor.json file.")
    except Exception as e:
        logging.error("Error creating new anvil_donor.json file. Exiting function. Error: {}".format(str(e)))
        return "Failure"

    # Ingest updated anvil_donor data
    logging.info("Ingesting updated anvil_donor data into TDR dataset.")
    source_full_file_path = "{}/{}/{}".format(ws_bucket, destination_dir, "anvil_donor.json")
    ingest_request = {
        "table": "anvil_donor",
        "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "json",
        "load_tag": "Ingest for {}".format(dataset_id),
        "path": source_full_file_path
    }
    attempt_counter = 0
    while True:
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest from file anvil_donor.json succeeded: {}".format(str(ingest_request_result)[0:1000]))
            break
        except Exception as e:
            logging.error("Error on Dataset Ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                logging.info("Retrying Dataset Ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                continue
            else:
                logging.error("Maximum number of retries exceeded. Exiting function.")
                return "Failure"

    # Return success message if no failures recorded
    logging.info("Function completed successfully.")
    return "Success"

# Loop through datasets and populate anvil_donor.organism_type
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
results = []
for dataset_id in dataset_id_list:
    status = populate_organism_type(dataset_id)
    results.append([dataset_id, status])
# Build the summary once, after the loop, so it is defined (and not stale) even when the list is empty
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status"])
display(results_df)


## Script to examine organism_type population

In [None]:
def validate_organism_type(dataset_id):
    """Report whether anvil_donor.organism_type holds any non-null value.

    Args:
        dataset_id: ID of the TDR dataset to inspect.

    Returns:
        "Success - Field Populated" when at least one donor row has organism_type set,
        "Success - Field Not Populated" when none do, or a string starting with
        "Failure - " when the TDR lookup or the BigQuery query fails.
    """
    # Retrieve dataset information (only the BigQuery location is needed, so the
    # schema is no longer requested)
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception:
        return "Failure - Issue Retrieving Dataset Info"

    # Count the donor rows that have a non-null organism_type
    client = bigquery.Client()
    query = """SELECT COUNT(organism_type) AS populated_cnt
                FROM `{project}.{dataset}.anvil_donor`""".format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        if df["populated_cnt"].values[0] > 0:
            return "Success - Field Populated"
        else:
            return "Success - Field Not Populated"
    except Exception:
        return "Failure - BigQuery Error"

# Loop through datasets and examine anvil_donor.organism_type population
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
results = []
for dataset_id in dataset_id_list:
    status = validate_organism_type(dataset_id)
    results.append([dataset_id, status])
# Build the summary once, after the loop, so it is defined (and not stale) even when the list is empty
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Update references to md5-added files

In [2]:
# Function to collect all datarepo rows for a particular table within a dataset
def collect_all_datarepo_rows(dataset_id, table_name):
    """Return every datarepo_row_id stored in one table of a TDR dataset.

    Args:
        dataset_id: ID of the TDR dataset.
        table_name: Name of the table to read the row IDs from.

    Returns:
        A list with the datarepo_row_id value of each row returned by BigQuery.

    Raises:
        Exception: Whatever the TDR lookup or the BigQuery query raised, after it
            has been logged.
    """
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving BQ project and schema: {}".format(str(e)))
        # Stop here: without the project and schema the query below cannot be built
        # (previously execution continued and failed with an UnboundLocalError)
        raise
    client = bigquery.Client()
    query = "SELECT datarepo_row_id FROM `{project}.{schema}.{table}`".format(project = bq_project, schema = bq_schema, table = table_name)
    try:
        query_job = client.query(query)
        return [row["datarepo_row_id"] for row in query_job]
    except Exception as e:
        logging.error("Error retrieving datarepo_row_id list: {}".format(str(e)))
        # Re-raise the original exception so its type and traceback are kept
        raise

# Function to delete rows from a dataset
def delete_old_records(dataset_id, table, datarepo_row_ids):
    """Submit a soft-delete request for the given rows of one dataset table.

    Args:
        dataset_id: ID of the TDR dataset.
        table: Name of the table the rows belong to.
        datarepo_row_ids: Row IDs to delete; when empty, no request is sent.

    Raises:
        Exception: Wraps any error raised while submitting or waiting on the job.
    """
    logging.info(f"Attempting to delete original {table} records.")

    # Nothing to delete: say so and leave without contacting TDR
    if not datarepo_row_ids:
        logging.info("No datarepo_row_ids specified for deletion.")
        return

    row_id_spec = {"rowIds": datarepo_row_ids}
    table_spec = {"tableName": table, "jsonArraySpec": row_id_spec}
    deletion_request = {
        "deleteType": "soft",
        "specType": "jsonArray",
        "tables": [table_spec],
    }
    tdr_datasets = data_repo_client.DatasetsApi(api_client=utils.refresh_tdr_api_client())
    try:
        deletion_job = tdr_datasets.apply_dataset_data_deletion(id=dataset_id, data_deletion_request=deletion_request)
        outcome, _job_id = utils.wait_for_tdr_job(deletion_job)
        logging.info("Result: {}".format(outcome))
    except Exception as e:
        logging.info("Error: {}".format(str(e)))
        raise Exception(e)

def ingest_updated_records(profile_id, dataset_id, table, records_dict):
    """Ingest updated records into a dataset table with the "replace" update strategy.

    Args:
        profile_id: Billing profile ID placed in the ingest request.
        dataset_id: ID of the TDR dataset to ingest into.
        table: Name of the target table.
        records_dict: Records sent inline in the request ("array" format).

    Returns:
        None once an ingest attempt succeeds.

    Raises:
        Exception: The error from the last attempt, once all attempts have failed.
    """
    logging.info(f"Submitting ingest for updated {table} records.")

    # Build, submit, and monitor ingest request
    ingest_request = {
        "table": table,
        "profile_id": profile_id,
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "array",
        "bulkMode": False,
        "load_tag": f"File ref fields patch for {table} in {dataset_id}",
        "records": records_dict
    }
    # Two attempts in total, matching the other ingest loops in this notebook. The
    # previous "< 1" check ran after the increment, so the retry never happened.
    max_attempts = 2
    attempt_counter = 0
    while True:
        try:
            # Refresh the client on every attempt
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest succeeded: {}".format(str(ingest_request_result)[0:1000]))
            return
        except Exception as e:
            logging.error("Error on ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < max_attempts:
                logging.info("Retrying ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                continue
            else:
                logging.error("Maximum number of retries exceeded. Logging error.")
                # Re-raise the original exception so its type and traceback are kept
                raise
                
def update_recs_w_file_refs(dataset_id):
    """Re-ingest records tied to files that lack an md5 hash, then rebuild the anvil_% tables.

    Steps, in order (the first failure stops processing and is reported in the return value):
      1. Read the dataset schema and BigQuery location from TDR.
      2. For each table with fileref columns (other than file_inventory and anvil_file), select the rows
         that reference a file whose file_inventory.md5_hash is NULL, rebuild their file references as
         {sourcePath, targetPath} from datarepo_load_history, ingest them, and delete the original rows.
      3. Re-ingest the affected file_inventory rows with md5_hash read from the GCS object metadata,
         and delete the original rows.
      4. Delete all rows from the listed anvil_% tables.
      5. Re-run the transform pipeline (utils.run_t_pipeline) with data validation skipped.

    Args:
        dataset_id: ID of the TDR dataset to patch.

    Returns:
        "Success", or a string starting with "Failure - " that names the stage that failed.
    """
    logging.info(f"Processing md5-added files for Dataset ID = {dataset_id}")
    # Billing profile used for every ingest request and for the transform pipeline re-run below
    profile_id = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

    ## Retrieve dataset information
    logging.info("Retrieving necessary information from TDR.")
    src_schema_dict = {}
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["tables"] = response["schema"]["tables"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
        return "Failure - Pre-processing"

    ## Parse TDR schema to identify file reference fields
    # NOTE(review): the schema's "array_of" flag is not consulted; array-valued fileref columns would
    # presumably need different SQL than the scalar IN / JOIN comparisons built below -- confirm.
    table_dict = {}
    for table in src_schema_dict["tables"]:
        if table["name"] in ["file_inventory", "anvil_file"]:
            continue
        fileref_cols = [column["name"] for column in table["columns"] if column["datatype"] == "fileref"]
        if fileref_cols:
            table_dict[table["name"]] = fileref_cols

    ## Loop through tables and re-process impacted records
    for table, col_list in table_dict.items():
        logging.info(f"Processing updates for {table}.")
        # Retrieve relevant records from BigQuery
        # Each fileref column is dropped from t.*, re-selected as a JSON struct from its own
        # load_hist join (alias t0, t1, ...), and OR-ed into the filter on md5-less files.
        old_cols = ", ".join(col_list)
        where_clause = " OR ".join(f"t.{column_name} IN (SELECT file_ref FROM file_list)" for column_name in col_list)
        new_cols = "".join(
            f", CASE WHEN t{idx}.source_name IS NOT NULL THEN TO_JSON(STRUCT(t{idx}.source_name AS sourcePath, t{idx}.target_path AS targetPath)) END AS {column_name}"
            for idx, column_name in enumerate(col_list)
        )
        join_clause = "".join(
            f" LEFT JOIN load_hist t{idx} ON t.{column_name} = t{idx}.file_id"
            for idx, column_name in enumerate(col_list)
        )

        query = """WITH 
            file_list AS (SELECT * FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
            load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
            SELECT t.* EXCEPT({old_cols}){new_cols}
            FROM `{project}.{dataset}.{table}` t {joins} WHERE {where}""".format(project=bq_project, dataset=bq_dataset, table=table, old_cols=old_cols, new_cols=new_cols, joins=join_clause, where=where_clause)
        try:
            client = bigquery.Client()
            res = client.query(query).result()
            if res.total_rows > 0:
                logging.info(f"{res.total_rows} records to process.")
                df = res.to_dataframe()
                # Round-trip through JSON to get plain, JSON-serializable Python values
                records_json = df.to_json(orient='records')
                records_list = json.loads(records_json)
            else:
                logging.info("No records to process.")
                records_list = []
        except Exception as e:
            logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
            return "Failure - Table Processing"
        # Ingest updated records back to TDR dataset
        try:
            datarepo_row_ids = []
            for record in records_list:
                datarepo_row_ids.append(record.pop("datarepo_row_id", None))
                for col in col_list:
                    # The CASE expression has no ELSE, so a column with no matching load-history row
                    # comes back as NULL; json.loads(None) would raise TypeError. Only decode strings.
                    if isinstance(record[col], str):
                        record[col] = json.loads(record[col])
            if records_list:
                # New rows are ingested first; the originals are removed only if that succeeds
                ingest_updated_records(profile_id, dataset_id, table, records_list)
                delete_old_records(dataset_id, table, datarepo_row_ids)
        except Exception as e:
            logging.error(f"Error replacing TDR records: {str(e)}")
            return "Failure - Table Processing"
        
    ## Re-process file_inventory
    logging.info(f"Processing updates for file_inventory.")
    # Retrieve relevant records from BigQuery
    query = """WITH 
        file_list AS (SELECT file_ref FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
        load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
        SELECT t1.*, CASE WHEN t2.source_name IS NOT NULL THEN TO_JSON(STRUCT(t2.source_name AS sourcePath, t2.target_path AS targetPath)) END AS file_ref
        FROM `{project}.{dataset}.file_inventory` t1
          INNER JOIN load_hist t2 ON t1.file_ref = t2.file_id
        WHERE file_ref IN (SELECT file_ref FROM file_list)""".format(project=bq_project, dataset=bq_dataset)
    try:
        client = bigquery.Client()
        res = client.query(query).result()
        if res.total_rows > 0:
            logging.info(f"{res.total_rows} records to process.")
            df = res.to_dataframe()
            records_json = df.to_json(orient='records')
            records_list = json.loads(records_json)
        else:
            logging.info("No records to process.")
            records_list = []
    except Exception as e:
        logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
        return "Failure - File Inventory Processing"
    # Loop through records and update md5_hash from GCS metadata
    try:
        storage_client = storage.Client()
        datarepo_row_ids = []
        uri_scheme = "gs://"
        for record in records_list:
            uri = record["uri"]
            if not uri.startswith(uri_scheme):
                raise ValueError(f"Unsupported file URI (expected gs://bucket/object): {uri}")
            # Split on the first "/" after the scheme so that any character allowed in a bucket or
            # object name is kept intact (a prefix-matching character class would silently truncate
            # names containing, for example, spaces or "=").
            bucket_name, _, object_name = uri[len(uri_scheme):].partition("/")
            if not bucket_name or not object_name:
                raise ValueError(f"Unsupported file URI (expected gs://bucket/object): {uri}")
            bucket = storage_client.bucket(bucket_name, user_project="anvil-datastorage")
            blob = bucket.get_blob(object_name)
            if blob is None:
                # get_blob returns None for a missing object; fail with a message that names the file
                raise FileNotFoundError(f"Object not found in GCS: {uri}")
            record["md5_hash"] = blob.md5_hash
            datarepo_row_ids.append(record.pop("datarepo_row_id", None))
    except Exception as e:
        logging.error(f"Error retrieving file metadata from GCS: {str(e)}")
        return "Failure - File Inventory Processing"
    # Ingest updated records back to TDR dataset
    try:
        if records_list:
            ingest_updated_records(profile_id, dataset_id, "file_inventory", records_list)
            delete_old_records(dataset_id, "file_inventory", datarepo_row_ids)
    except Exception as e:
        logging.error(f"Error replacing TDR records: {str(e)}")
        return "Failure - File Inventory Processing"

    ## Empty anvil_% tables
    logging.info("Clearing out existing anvil_% tables")
    table_list = ["anvil_activity", "anvil_alignmentactivity", "anvil_antibody", "anvil_assayactivity", "anvil_biosample", "anvil_diagnosis", "anvil_donor", "anvil_file", "anvil_sequencingactivity", "anvil_variantcallingactivity"]
    for table in table_list:
        try:
            datarepo_row_ids = collect_all_datarepo_rows(dataset_id, table)
            if datarepo_row_ids:
                delete_old_records(dataset_id, table, datarepo_row_ids)
        except Exception as e:
            logging.error(f"Error clearing out existing anvil_% records: {str(e)}")
            return "Failure - anvil_% Record Deletion"
    
    ## Re-run T pipeline without validation
    params = {}
    params["ws_name"] = ws_name
    params["ws_project"] = ws_project
    params["ws_bucket"] = ws_bucket
    params["ws_bucket_name"] = ws_bucket_name
    params["profile_id"] = profile_id
    params["mapping_target"] = "anvil"
    params["skip_transforms"] = False
    params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names 
    params["skip_schema_extension"] = False
    params["skip_ingests"] = False
    params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
    params["skip_file_relation_inference"] = False
    params["skip_dangling_fk_resolution"] = False
    params["skip_supplementary_file_identification"] = False
    params["skip_snapshot_creation"] = False
    params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
    params["skip_data_validation"] = True
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        phs_id = dataset_info["phs_id"]
        consent_name = dataset_info["properties"]["consent_name"]
        auth_domains = dataset_info["properties"]["auth_domains"]
    except Exception as e:
        logging.error(f"Error retrieving dataset details for T pipeline: {str(e)}")
        return "Failure - Dataset Retrieval for T Pipeline"
    if dataset_name:
        params["dataset_id"] = dataset_id
        params["dataset_name"] = dataset_name
        params["phs_id"] = phs_id
        params["consent_name"] = consent_name
        params["auth_domains"] = auth_domains
        utils.run_t_pipeline(params)
    
    # Return success message if no failures recorded
    logging.info("Function completed successfully.")
    return "Success"

# Loop through datasets and process md5 updates
dataset_id_list = [
'700303c2-fcef-48a5-9900-096bf34e2d83',
'a715c70d-da92-43ee-a851-1a27277909a2',
]
results = []
for dataset_id in dataset_id_list:
    status = update_recs_w_file_refs(dataset_id) 
    results.append([dataset_id, status])
# Build the summary once, after the loop, so results_df is defined even when dataset_id_list is empty
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status"])
display(results_df)


03/31/2023 01:41:51 PM - INFO: Processing md5-added files for Dataset ID = 700303c2-fcef-48a5-9900-096bf34e2d83
03/31/2023 01:41:51 PM - INFO: Retrieving necessary information from TDR.
03/31/2023 01:41:51 PM - INFO: Processing updates for sample.
03/31/2023 01:41:54 PM - INFO: No records to process.
03/31/2023 01:41:54 PM - INFO: Processing updates for file_inventory.
03/31/2023 01:41:56 PM - INFO: 1 records to process.
03/31/2023 01:41:57 PM - INFO: Submitting ingest for updated file_inventory records.
TDR Job ID: Qf0n906NT1OCnyEMLf0mtg
03/31/2023 01:42:28 PM - INFO: Ingest succeeded: {'dataset_id': '700303c2-fcef-48a5-9900-096bf34e2d83', 'dataset': 'ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107', 'table': 'file_inventory', 'path': None, 'load_tag': 'File ref fields patch for file_inventory in 700303c2-fcef-48a5-9900-096bf34e2d83', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}
03/31/2023 01:42:28 PM - INFO: Attempting to delete original file_inventory records.
TDR Job ID: ykD

TDR Job ID: RgWm3B9lTZahjbyVkSG44w
03/31/2023 01:47:37 PM - INFO: File relationships inference ingest succeeded: {'dataset_id': '700303c2-fcef-48a5-9900-096bf34e2d83', 'dataset': 'ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107', 'table': 'anvil_activity', 'path': None, 'load_tag': 'File relationships inference ingest for 700303c2-fcef-48a5-9900-096bf34e2d83', 'row_count': 2993, 'bad_row_count': 0, 'load_result': None}
03/31/2023 01:47:37 PM - INFO: Running dangling foreign key resolution.
03/31/2023 01:47:37 PM - INFO: Attempting to identify the TDR object, and collect and parse its schema...
03/31/2023 01:47:38 PM - INFO: Attempting to identify and remediate dangling foreign keys...
03/31/2023 01:47:38 PM - INFO: Identifying dangling foreign keys for anvil_donor...
03/31/2023 01:47:39 PM - INFO: Identifying dangling foreign keys for anvil_antibody...
03/31/2023 01:47:41 PM - INFO: Identifying dangling foreign keys for anvil_biosample...
03/31/2023 01:47:43 PM - INFO: Identifying dangling 

Unnamed: 0,Dataset,Time,Step,Task,Status,Message
0,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:43:34,Transform Artifact Retrieval,Confirm Transform Artifact Retrieval,Success,
1,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:43:45,Transformed Files Creation,File: anvil_activity.json,Success,
2,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:43:53,Transformed Files Creation,File: anvil_biosample.json,Success,
3,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:43:58,Transformed Files Creation,File: anvil_dataset.json,Success,
4,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:05,Transformed Files Creation,File: anvil_donor.json,Success,
5,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:12,Transformed Files Creation,File: anvil_file.json,Success,
6,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:18,Transformed Files Creation,File: anvil_project.json,Success,
7,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:18,TDR Schema Extension,Extend TDR Schema,Success,No new tables or relationships to add to the TDR schema.
8,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:39,Dataset Ingests,Table: anvil_activity - File: anvil_activity.json,Success,"Job_ID: iflXmnzUQsyo3igv9uipjQ - Truncated Response: {'dataset_id': '700303c2-fcef-48a5-9900-096bf34e2d83', 'dataset': 'ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107', 'table': 'anvil_activity', 'path': 'gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/transformed/anvil/700303c2-fcef-48a5-9900-096bf34e2d83/table_data/anvil_activity.json', 'load_tag': 'Ingest for 700303c2-fcef-48a5-9900-096bf34e2d83', 'row_count': 1496, 'bad_row_count': 0, 'load_result': None}"
9,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:59,Dataset Ingests,Table: anvil_biosample - File: anvil_biosample.json,Success,"Job_ID: 4lJLTcMnSp-vQbj1duD0VQ - Truncated Response: {'dataset_id': '700303c2-fcef-48a5-9900-096bf34e2d83', 'dataset': 'ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107', 'table': 'anvil_biosample', 'path': 'gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/transformed/anvil/700303c2-fcef-48a5-9900-096bf34e2d83/table_data/anvil_biosample.json', 'load_tag': 'Ingest for 700303c2-fcef-48a5-9900-096bf34e2d83', 'row_count': 1496, 'bad_row_count': 0, 'load_result': None}"


03/31/2023 01:50:22 PM - INFO: Function completed successfully.
03/31/2023 01:50:22 PM - INFO: Processing md5-added files for Dataset ID = a715c70d-da92-43ee-a851-1a27277909a2
03/31/2023 01:50:22 PM - INFO: Retrieving necessary information from TDR.
03/31/2023 01:50:22 PM - INFO: Processing updates for pggb.
03/31/2023 01:50:25 PM - INFO: 8 records to process.
03/31/2023 01:50:26 PM - INFO: Submitting ingest for updated pggb records.
TDR Job ID: 1fd9nwmgSyu1ChY0zxaJ1Q
03/31/2023 01:51:09 PM - INFO: Ingest succeeded: {'dataset_id': 'a715c70d-da92-43ee-a851-1a27277909a2', 'dataset': 'ANVIL_HPRC_20230310', 'table': 'pggb', 'path': None, 'load_tag': 'File ref fields patch for pggb in a715c70d-da92-43ee-a851-1a27277909a2', 'row_count': 8, 'bad_row_count': 0, 'load_result': {'loadSummary': {'loadTag': 'File ref fields patch for pggb in a715c70d-da92-43ee-a851-1a27277909a2', 'jobId': '1fd9nwmgSyu1ChY0zxaJ1Q', 'totalFiles': 0, 'succeededFiles': 0, 'failedFiles': 0, 'notTriedFiles': 0}, 'loadFi

Unnamed: 0,dataset_id,run_status
0,700303c2-fcef-48a5-9900-096bf34e2d83,Success
1,a715c70d-da92-43ee-a851-1a27277909a2,Failure - Table Processing


In [None]:
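# # Testing
# # NOTE: the rest of this cell is a commented-out, top-level copy of the body of update_recs_w_file_refs()
# # (most `return` statements are commented out or swapped for `break`), apparently kept for manual debugging.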
# dataset_id = 'bc6075ac-5cfe-4613-8601-36ceb614939e'

# logging.info(f"Processing md5-added files for Dataset ID = {dataset_id}")

# ## Retrieve dataset information
# logging.info("Retrieving necessary information from TDR.")
# src_schema_dict = {}
# api_client = utils.refresh_tdr_api_client()
# datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
# try:
#     response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
#     src_schema_dict["tables"] = response["schema"]["tables"]
#     bq_project = response["access_information"]["big_query"]["project_id"]
#     bq_dataset = response["access_information"]["big_query"]["dataset_name"]
# except Exception as e:
#     logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
#     #return "Failure - Pre-processing"

# ## Parse TDR schema to identify file reference fields
# table_dict = {}
# for table in src_schema_dict["tables"]:
#     if table["name"] in ["file_inventory", "anvil_file"]:
#         continue
#     else:
#         col_list = []
#         for column in table["columns"]:
#             if column["datatype"] == "fileref":
#                 col_list.append([column["name"], column["array_of"]])
#         if col_list:
#             table_dict[table["name"]] = col_list

# ## Loop through tables and re-process impacted records
# for table in table_dict.keys():
#     logging.info(f"Processing updates for {table}.")
#     # Retrieve relevant records from BigQuery
#     col_list = []
#     old_cols = ""
#     new_cols = ""
#     join_clause = ""
#     where_clause = ""
#     for idx, col in enumerate(table_dict[table]):
#         column_name = col[0]
#         col_list.append(column_name)
#         if idx == 0: 
#             old_cols += column_name
#             where_clause += f"t.{column_name} IN (SELECT file_ref FROM file_list)"
#         else:
#             old_cols += ", " + column_name
#             where_clause += f" OR t.{column_name} IN (SELECT file_ref FROM file_list)"
#         new_cols += f", CASE WHEN t{idx}.source_name IS NOT NULL THEN TO_JSON(STRUCT(t{idx}.source_name AS sourcePath, t{idx}.target_path AS targetPath)) END AS {column_name}"
#         join_clause += f" LEFT JOIN load_hist t{idx} ON t.{column_name} = t{idx}.file_id"

#     query = """WITH 
#         file_list AS (SELECT * FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
#         load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
#         SELECT t.* EXCEPT({old_cols}){new_cols}
#         FROM `{project}.{dataset}.{table}` t {joins} WHERE {where}""".format(project=bq_project, dataset=bq_dataset, table=table, old_cols=old_cols, new_cols=new_cols, joins=join_clause, where=where_clause)
#     try:
#         client = bigquery.Client()
#         res = client.query(query).result()
#         if res.total_rows > 0:
#             logging.info(f"{res.total_rows} records to process.")
#             df = res.to_dataframe()
#             records_json = df.to_json(orient='records')
#             records_list = json.loads(records_json)
#         else:
#             logging.info("No records to process.")
#             records_list = []
#     except Exception as e:
#         logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
#         break
#         #return "Failure - Table Processing"
#     # Ingest updated records back to TDR dataset
#     try:
#         datarepo_row_ids = []
#         for record in records_list:
#             datarepo_row_ids.append(record.pop("datarepo_row_id", None))
#             for col in col_list:
#                 record[col] = json.loads(record[col])
#         if records_list:
#             ingest_updated_records("e0e03e48-5b96-45ec-baa4-8cc1ebf74c61", dataset_id, table, records_list)
#             delete_old_records(dataset_id, table, datarepo_row_ids)
#     except Exception as e:
#         logging.error(f"Error replacing TDR records: {str(e)}")
#         break
#         #return "Failure - Table Processing"

# # ## Re-process file_inventory
# # logging.info(f"Processing updates for file_inventory.")
# # # Retrieve relevant records from BigQuery
# # query = """WITH 
# #     file_list AS (SELECT file_ref FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
# #     load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
# #     SELECT t1.*, CASE WHEN t2.source_name IS NOT NULL THEN TO_JSON(STRUCT(t2.source_name AS sourcePath, t2.target_path AS targetPath)) END AS file_ref
# #     FROM `{project}.{dataset}.file_inventory` t1
# #       INNER JOIN load_hist t2 ON t1.file_ref = t2.file_id
# #     WHERE file_ref IN (SELECT file_ref FROM file_list)""".format(project=bq_project, dataset=bq_dataset)
# # try:
# #     client = bigquery.Client()
# #     res = client.query(query).result()
# #     if res.total_rows > 0:
# #         logging.info(f"{res.total_rows} records to process.")
# #         df = res.to_dataframe()
# #         records_json = df.to_json(orient='records')
# #         records_list = json.loads(records_json)
# #     else:
# #         logging.info("No records to process.")
# #         records_list = []
# # except Exception as e:
# #     logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
# #     #return "Failure - File Inventory Processing"
# # # Loop through records and update md5_hash from GCS metadata
# # try:
# #     storage_client = storage.Client()
# #     datarepo_row_ids = []
# #     for record in records_list:
# #         bucket = re.match('gs:\/\/([a-z0-9\-]+)', record["uri"]).group(1)
# #         obj = re.match('gs:\/\/[a-z0-9\-]+\/([A-Za-z0-9\-_\/\.]+)', record["uri"]).group(1)
# #         bucket = storage_client.bucket(bucket, user_project="anvil-datastorage")
# #         blob = bucket.get_blob(obj)
# #         record["md5_hash"] = blob.md5_hash
# #         datarepo_row_ids.append(record.pop("datarepo_row_id", None))
# # except Exception as e:
# #     logging.error(f"Error retrieving file metadata from GCS: {str(e)}")
# #     #return "Failure - File Inventory Processing"
# # # Ingest updated records back to TDR dataset
# # try:
# #     if records_list:
# #         ingest_updated_records("e0e03e48-5b96-45ec-baa4-8cc1ebf74c61", dataset_id, "file_inventory", records_list)
# #         delete_old_records(dataset_id, "file_inventory", datarepo_row_ids)         
# # except Exception as e:
# #     logging.error(f"Error replacing TDR records: {str(e)}")
# #     #return "Failure - File Inventory Processing"

# # ## Empty anvil_% tables
# # logging.info("Clearing out existing anvil_% tables")
# # table_list = ["anvil_activity", "anvil_alignmentactivity", "anvil_antibody", "anvil_assayactivity", "anvil_biosample", "anvil_diagnosis", "anvil_donor", "anvil_file", "anvil_sequencingactivity", "anvil_variantcallingactivity"]
# # for table in table_list:
# #     try:
# #         datarepo_row_ids = collect_all_datarepo_rows(dataset_id, table)
# #         if datarepo_row_ids:
# #             delete_old_records(dataset_id, table, datarepo_row_ids)
# #     except Exception as e:
# #         logging.error(f"Error clearing out existing anvil_% records: {str(e)}")
# #         break
# #         #return "Failure - anvil_% Record Deletion"

# # ## Re-run T pipeline without validation
# # params = {}
# # params["ws_name"] = ws_name
# # params["ws_project"] = ws_project
# # params["ws_bucket"] = ws_bucket
# # params["ws_bucket_name"] = ws_bucket_name
# # params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61" 
# # params["mapping_target"] = "anvil"
# # params["skip_transforms"] = False
# # params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names 
# # params["skip_schema_extension"] = False
# # params["skip_ingests"] = False
# # params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
# # params["skip_file_relation_inference"] = False
# # params["skip_dangling_fk_resolution"] = False
# # params["skip_supplementary_file_identification"] = False
# # params["skip_snapshot_creation"] = False
# # params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
# # params["skip_data_validation"] = True
# # try:
# #     api_client = utils.refresh_tdr_api_client()
# #     datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
# #     dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
# #     dataset_name = dataset_info["name"]
# #     phs_id = dataset_info["phs_id"]
# #     consent_name = dataset_info["properties"]["consent_name"]
# #     auth_domains = dataset_info["properties"]["auth_domains"]
# #     src_workspaces = dataset_info["properties"]["source_workspaces"]
# # except:
# #     dataset_name = ""
# #     return "Failure - Dataset Retrieval for T Pipeline"
# # if dataset_name:
# #     params["dataset_id"] = dataset_id
# #     params["dataset_name"] = dataset_name
# #     params["phs_id"] = phs_id
# #     params["consent_name"] = consent_name
# #     params["auth_domains"] = auth_domains
# #     utils.run_t_pipeline(params)

# # Return success message if no failures recorded
# logging.info("Function completed successfully.")
# #return "Success"


In [None]:
# for idx, record in enumerate(records_list):
#     if record["library_2_estimated_library_size"]:
#         print(str(idx) + " - " + str(record["library_2_estimated_library_size"]))

In [None]:
# records_list[50]

# Add new supplementary workspace files to TDR dataset

## Script to identify new supplementary files and ingest them to TDR dataset

In [None]:
def ingest_supplementary_files(dataset_id, dry_run=True):
    """Find files in the dataset's source workspace bucket(s) that are missing from file_inventory.

    The dataset's source workspaces are resolved to buckets, a fresh inventory is built with
    bfi.build_inventory, and any file whose "uri" is not already in the dataset's BigQuery
    file_inventory table is treated as new.

    Args:
        dataset_id: ID of the TDR dataset to inspect.
        dry_run: When True (the default, matching the previous behavior of this function), the new
            file_inventory records are returned and nothing is ingested. When False, the records
            are ingested into the dataset's file_inventory table and a status string is returned.

    Returns:
        A "Failure - ..." string if any lookup step fails. Otherwise the list of new record dicts
        (dry_run=True), or "Success" / "Failure - Ingest error: ..." (dry_run=False).
    """
    
    # Retrieve dataset details
    logging.info("Retrieving dataset details.")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        # The API call sits inside the try so that a failed lookup is reported as a status string
        dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        source_workspaces = dataset_details["properties"]["source_workspaces"]
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error(f"Error retrieving dataset details: {str(e)}")
        return "Failure - Issue Retrieving Dataset Info"
    
    # Use source workspace(s) to find workspace bucket(s) to look for new files
    logging.info("Determining source workspace bucket(s).")
    data_files_src_buckets = {}
    for ws in source_workspaces:
        try:
            ws_attributes = utils.get_workspace_attributes("anvil-datastorage", ws)
            src_bucket = ws_attributes["bucketName"] if ws_attributes.get("bucketName") else ""
            if not src_bucket:
                logging.error(f"No bucket name found for source workspace {ws}.")
                return "Failure - Issue Retrieving Source Buckets"
            elif src_bucket not in data_files_src_buckets:
                data_files_src_buckets[src_bucket] = {
                    "include_dirs": [],
                    "exclude_dirs": []
                }
        except Exception as e:
            logging.error(f"Error retrieving source bucket for workspace {ws}: {str(e)}")
            return "Failure - Issue Retrieving Source Buckets"
    
    # Pull existing file inventory from BigQuery
    logging.info("Pulling existing file inventory records.")
    client = bigquery.Client()
    query = """SELECT uri FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
    try:
        output = client.query(query).result()
        # A set keeps the membership checks in the diff below constant-time
        existing_uris = {row.uri for row in output}
    except Exception as e:
        logging.error(f"Error retrieving existing file inventory records: {str(e)}")
        return "Failure - Issue Retrieving Existing File Inventory Records"
        
    # Build file inventory from workspace bucket(s)
    logging.info("Building new file inventory.")
    params = {}
    params["data_files_src_buckets"] = data_files_src_buckets
    params["google_project"] = "terra-349c8d95"
    params["file_inventory_dir"] = "ingest_pipeline/input/temp/data_files/file_inventory"
    inventory, _ = bfi.build_inventory(params)
    
    # Diff files to ingest
    logging.info("Diffing new and existing file inventory records.")
    ingest_list = [file for file in inventory if file["uri"] not in existing_uris]
    df_inventory = pd.DataFrame(ingest_list)
    records_dict = df_inventory.to_dict(orient="records")
    if dry_run:
        return records_dict
    if not records_dict:
        logging.info("No new file inventory records to ingest.")
        return "Success"
    
    # Build, submit, and monitor ingest request
    ingest_request = {
        "table": "file_inventory",
        "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "array",
        "load_tag": "Supplementary file ingest for {}".format(dataset_id),
        "records": records_dict
    }
    attempt_counter = 0
    while True:
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            return "Success"
        except Exception as e:
            logging.error("Error on supplementary file ingest: {}".format(str(e)))
            attempt_counter += 1
            # Two attempts in total: one retry after a 10 second pause
            if attempt_counter < 2:
                sleep(10)
                continue
            else:
                return f"Failure - Ingest error: {str(e)}"
    
# # Loop through datasets and ingest additional files if necessary
# dataset_id_list = [
# 'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
# ]
# results = []
# for dataset_id in dataset_id_list:
#     status = ingest_supplementary_files(dataset_id) 
#     results.append([dataset_id, status])
#     results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status"])
# display(results_df)


In [None]:
dataset_id = 'bf9108b6-bebc-4b3b-8517-6a2cce5f7d89'

# Retrieve dataset details
logging.info("Retrieving dataset details.")
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
try:
    source_workspaces = dataset_details["properties"]["source_workspaces"]
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
except Exception as e:
    print("Failure - Issue Retrieving Dataset Info") 

# Use source workspace(s) to find workspace bucket(s) to look for new files
logging.info("Determining source workspace bucket(s).")
data_files_src_buckets = {}
for ws in source_workspaces:
    try:
        ws_attributes = utils.get_workspace_attributes("anvil-datastorage", ws)
        src_bucket = ws_attributes["bucketName"] if ws_attributes.get("bucketName") else ""
        if not src_bucket:
            print("Failure - Issue Retrieving Source Buckets")
        elif src_bucket not in data_files_src_buckets:
            data_files_src_buckets[src_bucket] = {
                "include_dirs": [],
                "exclude_dirs": []
            }
    except Exception as e:
        print("Failure - Issue Retrieving Source Buckets")

# Pull existing file inventory from BigQuery
logging.info("Pulling existing file inventory records.")
client = bigquery.Client()
query = """SELECT uri FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
print(query)
file_list = []
try:
    output = client.query(query).result()
    if output.total_rows > 0:
        for row in output:
            file_list.append(row.uri)
except Exception as e:
        print("Failure - Issue Retrieving Existing File Inventory Records")

# Scan the source workspace bucket(s) and build the candidate file inventory
logging.info("Building new file inventory.")
params = {
    "data_files_src_buckets": data_files_src_buckets,
    "google_project": "terra-349c8d95",
    "file_inventory_dir": "ingest_pipeline/input/temp/data_files/file_inventory",
}
inventory, retry_count = bfi.build_inventory(params)

# Diff files to ingest: keep only inventory records whose URI is not already in the dataset
logging.info("Diffing new and existing file inventory records.")
# A set gives constant-time membership checks; scanning the list for every
# inventory record is quadratic and becomes very slow on large inventories.
existing_uris = set(file_list)
ingest_list = [record for record in inventory if record["uri"] not in existing_uris]
# Round-trip through a DataFrame so every record carries the same set of keys
df_inventory = pd.DataFrame(ingest_list)
records_list = df_inventory.to_dict(orient="records")
records_cnt = len(records_list)
logging.info(f"New file inventory records to ingest: {records_cnt}")

# Break records to ingest into chunks if necessary
chunk_size = 100000
chunk_cnt = math.ceil(records_cnt/chunk_size)
for i in range(0, chunk_cnt):
    if i == 0:
        start_row = 0
        end_row = chunk_size
    else:
        start_row = (i*chunk_size) + 1
        end_row = min((i+1)*chunk_size, records_cnt)
    # Write out chunk to file for ingest
    destination_file = "file_inventory_" + str(i) + ".json"
    with open(destination_file, "w") as outfile:
        for idx, val in enumerate(records_list):
            if idx >= start_row and idx <= end_row:
                json.dump(val, outfile)
                if idx < end_row:
                    outfile.write("\n")
    !gsutil cp $destination_file $ws_bucket/ingest_pipeline/input/temp 2> stdout   
    # Build, submit, and monitor ingest request
    logging.info(f"Ingesting new file inventory records into TDR (chunk #{i}).")
    ingest_request = {
        "table": "file_inventory",
        "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "json",
        "load_tag": "Supplementary file ingest for {}".format(dataset_id),
        "bulkMode": True,
        "path": f"{ws_bucket}/ingest_pipeline/input/temp/{destination_file}"
    }
    attempt_counter = 0
    while True:
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            print("Success")
            break
        except Exception as e:
            logging.error("Error on new file inventory records ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                sleep(10)
                continue
            else:
                print(f"Failure - Ingest error (chunk #{i}): {str(e)}")
                break