# Imports

In [None]:
# Install additional modules (one time effort per cloud environment)
!pip install --upgrade import_ipynb data_repo_client urllib3 xmltodict

In [1]:
## imports and environment variables

# Imports
import import_ipynb
import pandas as pd
import os
import re
import json
import data_repo_client
from google.cloud import bigquery
import ingest_pipeline_utilities as utils
import build_file_inventory as bfi
import identify_supplementary_files as isf
import logging
from time import sleep
import datetime
from google.cloud import storage
import math
import csv
import numpy as np

# Logging: timestamped messages at INFO level and above
logging.basicConfig(
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
    level=logging.INFO,
)

# Pandas display: no row/column/width truncation, centered headers, 3-digit precision
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ("display.max_colwidth", None),
    ('display.width', 1000),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
):
    pd.set_option(option_name, option_value)

# Workspace settings read from environment variables (KeyError if one is missing)
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name without the leading gs:// scheme
ws_bucket_name = re.sub('^gs://', '', ws_bucket)


importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.40: 4/9/2024 11:23am - Nate Calvanese - Fixed file inventory bug when fileref fields contained mix of remote and nonremote files.
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
importing Jupyter notebook from build_file_inventory.ipynb
Version 2.0.3: 10/6/2023 9:29am - Nate Calvanese - Tweaked file extension parsing logic
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.10: 1/12/2024 11:25am - Nate Calvanese - Made max_combined_rec_ref_size configurable
importing Jupyter notebook from build_mapping_query.ipynb
Version 1.0.13: 2/1/2024 4:16pm - Nate Calvanese - Updated logic to not include field in select statement when source table cant be joined to
importing Jupyter notebook from output_data_validation.ipynb
Version 2.0.7: 12/13/2023 1:13pm -- Replaced deprecated df append with pd.concat
impo

# Create new snapshot

## Script to create new full view snapshot

In [None]:
# Parameters
params = {}
params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org", "auth-domain"]
params["anvil_schema_version"] = "ANV5"

# Loop through datasets and create new snapshot
dataset_id_run_list = [
'93b2ac60-2208-4ef8-a1c2-68a623e45807',
'f9224ea2-dd31-421d-80d4-f35082ef8d68',
'3376a8b6-7ef6-4191-97ab-a547da0d330d',
'0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
'6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
'4b341ba9-49a5-43a2-9b7e-cc96beb59946',
'841970b7-bed0-4a75-a28a-a4cc59740a84',
'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
'ed82e510-37aa-47f6-88f0-b2ba33e0fdb0',
'9a06c401-da3f-41b4-b38b-238796fcae09',
'2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
'5e0e8f9a-ce97-4b18-9540-3015c61e393c',
'1c8ba244-1c7f-433a-825b-d2d34d018dcf',
'6d18aafc-0240-499c-902e-a72a5b98ff0a',
'74d1e549-5ae8-4410-9428-f8f2cc85fa80',
'033fc1e1-0337-4656-bbe1-3f06fef641e9',
'629e31cb-dd7b-4345-abf2-fa23c6c65a09',
'0132f320-830d-40d0-a4da-06a5d5f9e8d9',
'bb7d6408-941a-4da6-8613-36498bc6d91b',
'd40af129-c13f-45b2-92f0-d0e8fa5cc1c9',
'ecd2d2f9-2b6f-4743-8d04-c9bb554a96cb',
]
# Pattern used to pull the snapshot ID out of the "Create and Share Snapshot" task message
snapshot_id_pattern = "{'id': '([a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'"
results = []
for dataset_id in dataset_id_run_list:
    # Retrieve the dataset details needed to name and describe the snapshot
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        phs_id = dataset_info["phs_id"]
        consent_name = dataset_info["properties"]["consent_name"]
        auth_domains = dataset_info["properties"]["auth_domains"]
        # NOTE(review): this value is not used below; the lookup only makes datasets
        # without the property fail retrieval -- confirm that is intended.
        src_workspaces = dataset_info["properties"]["source_workspaces"]
    except Exception as e:
        # Record the failure instead of silently dropping the dataset from the results
        logging.error("Error retrieving dataset {}: {}".format(dataset_id, str(e)))
        results.append([dataset_id, "Error", ""])
        continue
    if not dataset_name:
        logging.error("Dataset {} has no name; skipping snapshot creation.".format(dataset_id))
        results.append([dataset_id, "Error", ""])
        continue

    # Build the per-dataset parameters and create the snapshot
    params["ws_bucket"] = ws_bucket
    params["dataset_id"] = dataset_id
    params["dataset_name"] = dataset_name
    params["phs_id"] = phs_id
    params["consent_name"] = consent_name
    params["auth_domains"] = auth_domains
    params["pipeline_results"] = []
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M")
    params["snapshot_name"] = params["dataset_name"] + "_" + params["anvil_schema_version"] + "_" + current_datetime_string
    utils.create_and_share_snapshot(params)

    # Inspect the entries that the call above left in params["pipeline_results"]
    int_df_results = pd.DataFrame(params["pipeline_results"], columns = ["Dataset", "Time", "Step", "Task", "Status", "Message"])
    # na=False keeps a missing Status from breaking the boolean mask
    errors = int_df_results[int_df_results["Status"].str.contains("Error", na=False)]
    if len(errors) > 0:
        results.append([dataset_id, "Error", ""])
        continue
    snapshot_messages = int_df_results[int_df_results["Task"]=="Create and Share Snapshot"]["Message"]
    snapshot_match = re.search(snapshot_id_pattern, str(snapshot_messages))
    if snapshot_match is None:
        # Without this guard a missing ID raised TypeError and aborted the whole run
        logging.error("No snapshot ID found in pipeline results for dataset {}.".format(dataset_id))
        results.append([dataset_id, "Error", ""])
        continue
    results.append([dataset_id, "Success", snapshot_match[1]])
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status", "snapshot_id"])
display(results_df)


## Verify Snapshots Have Properly Formatted DRS URI

In [None]:
def validate_snapshot_drs_format(snapshot_id):
    """Check the format of anvil_file.file_ref values in a snapshot.

    Looks up the snapshot's BigQuery project and dataset through the TDR
    snapshots API, then compares the number of non-null file_ref values in
    anvil_file with the number matching LIKE '%drs://drs.anv0:v2_%'.

    Args:
        snapshot_id: ID of the snapshot to check.

    Returns:
        "Success" when the two counts are equal, otherwise a string starting
        with "Failure" that names the step that failed or reports the counts.
    """
    # Look up where the snapshot's tables live in BigQuery
    tdr_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=tdr_client)
    try:
        snapshot_info = snapshots_api.retrieve_snapshot(id=snapshot_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_access = snapshot_info["access_information"]["big_query"]
        bq_project = bq_access["project_id"]
        bq_dataset = bq_access["dataset_name"]
    except Exception:
        return "Failure - Issue Retrieving Snapshot Info"

    # Count file_ref values overall and those matching the expected DRS URI pattern
    bq_client = bigquery.Client()
    count_sql = """SELECT COUNT(file_ref) AS rec_cnt, COUNT(CASE WHEN file_ref LIKE '%drs://drs.anv0:v2_%' THEN file_ref END) AS valid_cnt
                FROM `{project}.{dataset}.anvil_file`""".format(project=bq_project, dataset=bq_dataset)
    try:
        counts = bq_client.query(count_sql).result().to_dataframe()
        if counts["rec_cnt"].values[0] == counts["valid_cnt"].values[0]:
            return "Success"
        total = counts["rec_cnt"].values[0]
        well_formed = counts["valid_cnt"].values[0]
        return f"Failure: Only {well_formed} of {total} records properly formatted"
    except Exception:
        return "Failure - BigQuery Error"

# Loop through snapshots and validate the DRS URI format of anvil_file.file_ref
snapshot_id_list = [
'c3d22305-b3f2-4561-a5b9-bed82ee742f4',
'9fe2abd4-70b4-4eee-b00d-38726ced8620',
'5329c25e-ccad-435d-9250-6fcc3ff88472',
'ced601b2-9a11-40e9-8067-241e5a5996ed',
'8165245c-2003-4ec7-bf57-731959022d47',
'737d454c-88be-477f-ae2c-ef473e2106ce',
'3bdbad9e-f9d4-4442-8606-791d490bf0af',
'cd19195f-25a0-44b1-b47d-ec99141833fc',
'b897e519-ba8b-4758-a263-6d57bd3b8e2b',
'1d385cfc-4bed-4f52-8f7b-ea54fc44b4f7',
'02d25240-823f-4b1d-8562-95385716a453',
'1974a21b-c409-4736-a3d7-e195fa96c4eb',
'99b46287-4790-492c-8a12-bea33f0f927c',
'c6ef5822-3929-4ae7-b5bc-dc27528bf226',
'08d19a7e-b868-4766-9f7e-d879d972cbd7',
'35186e6d-2728-4a8e-b0ad-6b34d0fe480c',
'b0d176bf-d094-4e33-a34b-b83a94de86ea',
'cc6bacc8-29fa-4d97-8856-79f52ea50c6f',
'85b721da-ad8e-4d82-93f0-0988f94af22e',
'407c7800-3ab4-4b13-ba45-c6c13c1c2278',
'2529f127-cff5-43ff-b879-06bc0e3468ff',
'b511be0b-7dc5-4767-a891-37f43d04a5a5',
'a7e031c3-62d4-46db-b2e2-0bdca6bbad65',
'5bba97dc-d6ab-4329-912f-148c8b807056',
'9cf61d88-d096-4981-b0c6-99db77554c01',
'4c722626-c559-4f5a-84bd-8d7d46983e1e',
'7c237e08-3329-4e64-bd2a-063be290e78b',
'4117144f-92e7-454f-9263-dad5e128cadb',
'ce2e7235-26e6-470f-8e05-298193b7f53d',
'6df525e1-b143-4e6f-b667-80c783ae1b66',
'92666b7c-4d50-4530-88e9-ea2d3da9d07a',
'42644c25-fa23-4b4e-8fcc-907cd8dcef60',
'155c11a9-638a-45c8-b172-7cf2e3e16fe6',
'b3da9fec-08ad-4496-a9ac-1411388fb5cc',
'0de07296-e3ff-4fe6-9183-9f421484197c',
'1b6273c6-7769-4daf-abee-93b11b322c73',
'ea50255a-45a4-4846-82e3-02b4f46f5b17',
'eb7045e1-2286-49f1-bce6-21b5d7fa5c32',
'b763c288-4132-434a-a6c9-25ad51b9d961',
'b67702a8-307d-4b20-835e-c0245d0761e5',
'88548251-e59e-4bc3-b71a-f1e9e2369919',
'd3dc5627-503b-48a5-ad79-31ab6c2fd417',
'ec14f8cd-5b1b-4124-a235-f11159984c7c',
'6d9e1212-4fa6-4632-be8a-75c45a474dd3',
'667eac9b-4e90-413d-80f3-d857b9829ab7',
'c091ea30-1862-4b1f-8e92-087b441472c3',
'43c86818-9bfe-46f2-9ae4-4a55a7baef1f',
'ebdaca04-ef29-42f3-8486-a94dade81bf8',
'f8781fbf-5fef-4481-8819-3df1bc724b7f',
'830df9ed-e4a6-4c9a-a97a-aa080fb030e4',
'84703c54-a9dd-400c-9701-2fc40922e3e3',
'c1c674dd-056a-470c-8874-bf70d8fae3a8',
'6a5b3be6-d1de-4f23-a431-b08e7ab231b8',
'ffe34538-3ddd-48de-b4a2-94f9b2dad086',
'2c6de04e-104d-42c8-8448-97d74985dacb',
'2a1882d9-88ca-4849-bcc1-f6914f593407',
'bf2f4106-cee9-419c-b4d1-d7b03a6293d5',
'a6c36f5e-b86c-4164-85ae-8bf0df2e4a90',
'7c19d852-e36a-4353-afea-10e501601d9a',
'00297802-e20a-413f-b389-a6f764b6600e',
'b8a455eb-827d-43a0-a89b-5d017747140f',
'3e85b06a-a6ea-4ce8-a655-44b1fce12138',
'9321b908-f2e4-437b-b53e-ed81754dcace',
'172bada7-f1c5-41c4-836d-05381beaed9a',
'133e902c-5ff0-4119-8078-db3e15006844',
'452bcafd-ab45-4e24-b5e0-13fcf22b0755',
'5e547934-c339-410e-a013-dfefed50f4b8',
'ffa84feb-ca0e-43d3-a04d-a402a8e24a3b',
'ff27037e-cb52-44ef-8979-f6e7ac3ed6f6',
'c853d4c0-d4be-433d-964e-e30bdc35480e',
'8fbe2def-b8ad-4b2d-90c9-0dd4517c67e1',
'03e54581-8fd3-47c3-9143-55368d2e4e86',
'9efae3c7-904c-48a8-939a-e82b46005ae1',
'5955a235-5be6-47bc-8303-ed0c4e68f501',
'e04edfef-69f8-47ff-8df9-dfff0e9218d2',
'f2a7be5a-4f7a-4a96-935e-ca7592855b45',
'7c90289b-be3e-4c9b-917a-d5e27d95dc15',
'0f46a588-b4ff-4a69-99e9-0a0bcf052522',
'cdd689fd-10f3-4cfa-b738-46549e689cac',
'eb7948be-1007-4b0e-b9b6-a5c40bbb9596',
'f20753f0-d09a-4b47-bffe-8f24ec354761',
'4cff04f4-eff9-4a62-bc6e-691accfbd328',
'9a61b980-4a33-465a-bc50-1aba00bc2cf6',
'90fe2016-e79c-456c-a5f9-3a31149fcd65',
'a4c62d7f-34f0-4e2e-9e46-c762d3ab0ff2',
'28dc8121-5e55-46c2-8313-681de2298986',
'dcc578ed-44bb-458f-8ff5-a78ca83f4616',
'aa42debe-3747-4dcd-8bc9-24eb90673fa5',
'5208772d-21f9-46b0-8167-0b05b57296b8',
'a2da748b-fec8-4e10-88ee-de32cbe8dee1',
'26df2a34-b10d-4361-ba2b-d9f966d09f61',
'dd00a8ba-ac49-481b-8d79-0e440adafd77',
'0df983d7-ed5e-44d2-acf1-686822b0cc7e',
'28559e94-ed57-48c8-bc8b-6cc4ad659a61',
'8b385bd3-52aa-48b9-be33-41f4d3fd4531',
'ce1bf5c3-525e-455d-a1e9-dd5f3d68c9d3',
'd0a6aa4c-821c-4bba-b53b-4f230ca3cda4',
'd9e817a2-6657-433b-8b2f-73790561725c',
'33c854eb-d228-4a82-8324-5e455ed1e447',
]
# Validate every snapshot and collect one [snapshot_id, status] row per snapshot
results = []
for snapshot_id in snapshot_id_list:
    status = validate_snapshot_drs_format(snapshot_id)
    results.append([snapshot_id, status])
# Build the summary frame once, after the loop: it is then always defined for this
# cell (even for an empty list) and is not rebuilt on every iteration
results_df = pd.DataFrame(results, columns = ["snapshot_id", "validation_status"])
display(results_df)

# Add and populate anvil_file.is_supplementary

## Script to patch dataset

In [None]:
# Set base parameters
params = {}
params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Loop through datasets and process is_supplementary field
dataset_id_list = [
'8b2b1c92-66cf-403c-8eb0-03b523d1550e',
'595b6755-e7ae-4e83-af2e-693c089aeec3',
'84ac0d05-4be5-43e9-973e-ef999144d802',
'732eaae3-b509-4a7a-8961-09d861e55253',
'544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
'f85ea65e-1943-4bd6-a541-71c5d8465ca9',
'280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
]
# Patch every dataset and collect one [dataset_id, status, output] row per dataset
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Patching dataset_id: {dataset_id}")
    # Output directory passed to the patch routine is specific to each dataset
    params["t_output_dir"] = "ingest_pipeline/output/transformed/anvil/{}/table_data".format(dataset_id)
    output, status = isf.identify_supplementary_files(params, dataset_id)
    results.append([dataset_id, status, output])
# Build the summary frame once, after the loop: it is then always defined for this
# cell (even for an empty list) and is not rebuilt on every iteration
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status", "output"])
display(results_df)


## Script to validate patch worked properly

In [None]:
def validate_supp_file_flg(dataset_id):
    """Validate the anvil_file.is_supplementary field of a TDR dataset.

    Three checks run in order, and the first one that fails determines the
    returned message:
      1. the dataset schema has an anvil_file table with an is_supplementary column;
      2. is_supplementary is non-null on every anvil_file row;
      3. no activity row links a file flagged supplementary to a biosample, and no
         activity row links a supplementary file to a non-supplementary file.

    Args:
        dataset_id: ID of the TDR dataset to validate.

    Returns:
        "Success" when all checks pass, otherwise a string starting with "Failure".
    """
    # Retrieve dataset schema and BigQuery location
    tdr_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=tdr_client)
    try:
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        schema_tables = dataset_info["schema"]["tables"]
        bq_project = dataset_info["access_information"]["big_query"]["project_id"]
        bq_dataset = dataset_info["access_information"]["big_query"]["dataset_name"]
    except Exception:
        return "Failure - Issue Retrieving Dataset Info"

    # Check 1: the field must exist on the (first) anvil_file table
    anvil_file_table = next((tbl for tbl in schema_tables if tbl["name"] == "anvil_file"), None)
    if anvil_file_table is None or not any(col["name"] == "is_supplementary" for col in anvil_file_table["columns"]):
        return "Failure - is_supplementary field not found"

    # Check 2: the field must be populated on every record
    bq_client = bigquery.Client()
    population_sql = """SELECT COUNT(*) AS rec_cnt, COUNT(is_supplementary) AS populated_cnt
                    FROM `{project}.{dataset}.anvil_file`""".format(project=bq_project, dataset=bq_dataset)
    try:
        population = bq_client.query(population_sql).result().to_dataframe()
        if population["rec_cnt"].values[0] != population["populated_cnt"].values[0]:
            return "Failure - is_supplementary field not populated"
    except Exception:
        return "Failure - BigQuery Error"

    # Check 3: the flag must be consistent with the activity links
    validation_query = """
            WITH activity_agg
            AS
            (
              SELECT used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_activity`
              UNION ALL 
              SELECT [] AS used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_alignmentactivity`
              UNION ALL 
              SELECT used_biosample_id, generated_file_id, [] AS used_file_id FROM `{project}.{dataset}.anvil_assayactivity`
              UNION ALL 
              SELECT used_biosample_id, generated_file_id, [] AS used_file_id FROM `{project}.{dataset}.anvil_sequencingactivity`
              UNION ALL 
              SELECT [] AS used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_variantcallingactivity`
            ),
            activity_exp 
            AS
            (
              SELECT file_id, int_file_id, biosample_id
              FROM activity_agg
                  LEFT JOIN UNNEST(used_biosample_id) AS biosample_id
                  LEFT JOIN UNNEST(generated_file_id) as file_id
                  LEFT JOIN UNNEST(used_file_id) as int_file_id
            ),
            activity_exp_tagged
            AS
            (
              SELECT a.file_id, b.is_supplementary AS file_id_supp, int_file_id, c.is_supplementary AS int_file_id_supp, biosample_id
              FROM activity_exp a
                  LEFT JOIN  `{project}.{dataset}.anvil_file` b
                  ON a.file_id = b.file_id
                  LEFT JOIN  `{project}.{dataset}.anvil_file` c
                  ON a.int_file_id = c.file_id 
            )
            SELECT CASE WHEN file_id_supp = TRUE AND biosample_id IS NOT NULL THEN 'Supplemental File Linked to BioSample' WHEN (file_id_supp = TRUE AND int_file_id_supp = FALSE) OR (file_id_supp = FALSE AND int_file_id_supp = TRUE) THEN 'Supplemental File Linked to Non-Supplemental File' ELSE 'No Issue Found' END AS finding, COUNT(*) AS occurrences
            FROM activity_exp_tagged
            GROUP by finding
            """.format(project=bq_project, dataset=bq_dataset)
    try:
        findings = json.loads(bq_client.query(validation_query).result().to_dataframe().to_json(orient='records'))
        # Occurrence counts for the two problem categories; anything else is a non-issue
        issue_counts = {
            "Supplemental File Linked to BioSample": 0,
            "Supplemental File Linked to Non-Supplemental File": 0,
        }
        non_issue_links = 0
        for finding in findings:
            if finding["finding"] in issue_counts:
                issue_counts[finding["finding"]] = finding["occurrences"]
            else:
                non_issue_links = finding["occurrences"]
        supp_linked_to_biosample = issue_counts["Supplemental File Linked to BioSample"]
        supp_linked_to_nonsupp = issue_counts["Supplemental File Linked to Non-Supplemental File"]
        if supp_linked_to_biosample > 0 or supp_linked_to_nonsupp > 0:
            return f"Failure - Errors found when validating supplementary files flagged in the TDR dataset: Supplemental Files Linked to a Biosample: {str(supp_linked_to_biosample)} Supplemental Files Linked to a Non-Supplemental File: {str(supp_linked_to_nonsupp)} Links with No Issues: {str(non_issue_links)}"
    except Exception:
        return "Failure - BigQuery Error"
    return "Success"

# Loop through datasets and validate is_supplementary field
dataset_id_list = [
'8b2b1c92-66cf-403c-8eb0-03b523d1550e',
'595b6755-e7ae-4e83-af2e-693c089aeec3',
'84ac0d05-4be5-43e9-973e-ef999144d802',
'732eaae3-b509-4a7a-8961-09d861e55253',
'544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
'f85ea65e-1943-4bd6-a541-71c5d8465ca9',
'280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
]
# Validate every dataset and collect one [dataset_id, status] row per dataset
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Validating dataset_id: {dataset_id}")
    status = validate_supp_file_flg(dataset_id)
    results.append([dataset_id, status])
# Build the summary frame once, after the loop: it is then always defined for this
# cell (even for an empty list) and is not rebuilt on every iteration
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Attempt to populate anvil_donor.organism_type

## Script to patch dataset

In [None]:
def populate_organism_type(dataset_id, profile_id="e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"):
    """Rebuild the anvil_donor records with organism_type and re-ingest them.

    Steps:
      1. Look up the dataset's BigQuery project and dataset name in TDR.
      2. Query anvil_donor, deriving organism_type ('Homo sapiens' or NULL) from
         the 'library:reference' rows of workspace_attributes.
      3. Write the rows as newline-delimited JSON and copy the file to
         <ws_bucket>/ingest_pipeline/output/transformed/anvil/<dataset_id>/table_data/
         with gsutil.
      4. Ingest that file into the dataset's anvil_donor table with the "replace"
         update strategy (two attempts at most, 10 seconds apart).

    Args:
        dataset_id: ID of the TDR dataset to patch.
        profile_id: Billing profile ID sent with the ingest request. Defaults to
            the profile ID previously hard-coded in this function.

    Returns:
        "Success" if every step completed, otherwise "Failure".
    """
    # Function-scope imports: only this function stages and uploads a local file
    import os
    import subprocess
    import tempfile

    logging.info(f"Processing anvil_donor.organism_type for Dataset ID = {dataset_id}")

    # Retrieve dataset information
    logging.info("Retrieving necessary information from TDR.")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
        return "Failure"

    # Re-process anvil_donor data to include organism_type (where available)
    logging.info("Re-processing existing anvil_donor data to include organism_type value.")
    client = bigquery.Client()
    target_file = "anvil_donor.json"
    destination_dir = "ingest_pipeline/output/transformed/anvil/{}/table_data".format(dataset_id)
    query = """SELECT donor_id, 
    (SELECT MAX(CASE WHEN REGEXP_CONTAINS(value, '(h37|h38|h39|hg16|hg17|hg18|hg19|hs37|hs38|b37)') THEN 'Homo sapiens' END) AS organism_type FROM `{project}.{dataset}.workspace_attributes` WHERE attribute = 'library:reference') AS organism_type,
    part_of_dataset_id, phenotypic_sex, reported_ethnicity, genetic_ancestry, source_datarepo_row_ids
    FROM `{project}.{dataset}.anvil_donor`""".format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        records_list = json.loads(df.to_json(orient='records'))
        # Stage the file in a temporary directory (removed automatically) so nothing
        # is left behind in, or overwritten in, the working directory
        with tempfile.TemporaryDirectory() as staging_dir:
            local_path = os.path.join(staging_dir, target_file)
            with open(local_path, 'w') as outfile:
                # One JSON object per line, no trailing newline
                outfile.write("\n".join(json.dumps(record) for record in records_list))
            # check=True turns a failed copy into an exception; previously a failed
            # copy was still reported as success and the ingest went ahead
            subprocess.run(
                ["gsutil", "cp", local_path, "{}/{}/".format(ws_bucket, destination_dir)],
                check=True,
                capture_output=True,
                text=True,
            )
        logging.info("Successfully created new anvil_donor.json file.")
    except Exception as e:
        # CalledProcessError carries gsutil's stderr; include it when present
        stderr_text = getattr(e, "stderr", None)
        detail = "{} {}".format(str(e), stderr_text).strip() if stderr_text else str(e)
        logging.error("Error creating new anvil_donor.json file. Exiting function. Error: {}".format(detail))
        return "Failure"

    # Ingest updated anvil_donor data
    logging.info("Ingesting updated anvil_donor data into TDR dataset.")
    source_full_file_path = "{}/{}/{}".format(ws_bucket, destination_dir, "anvil_donor.json")
    ingest_request = {
        "table": "anvil_donor",
        "profile_id": profile_id,
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "json",
        "load_tag": "Ingest for {}".format(dataset_id),
        "path": source_full_file_path
    }
    attempt_counter = 0
    while True:
        try:
            # Refresh the client on every attempt
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest from file anvil_donor.json succeeded: {}".format(str(ingest_request_result)[0:1000]))
            break
        except Exception as e:
            logging.error("Error on Dataset Ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                logging.info("Retrying Dataset Ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                continue
            else:
                logging.error("Maximum number of retries exceeded. Exiting function.")
                return "Failure"

    # Return success message if no failures recorded
    logging.info("Function completed successfully.")
    return "Success"

# Loop through datasets and populate anvil_donor.organism_type
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
# Patch every dataset and collect one [dataset_id, status] row per dataset
results = []
for dataset_id in dataset_id_list:
    status = populate_organism_type(dataset_id)
    results.append([dataset_id, status])
# Build the summary frame once, after the loop: it is then always defined for this
# cell (even for an empty list) and is not rebuilt on every iteration
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status"])
display(results_df)


## Script to examine organism_type population

In [None]:
def validate_organism_type(dataset_id):
    """Report whether anvil_donor.organism_type holds any value in a TDR dataset.

    Args:
        dataset_id: ID of the TDR dataset to examine.

    Returns:
        "Success - Field Populated" when at least one anvil_donor row has a
        non-null organism_type, "Success - Field Not Populated" when none does,
        or a string starting with "Failure" when the dataset lookup or the
        BigQuery query fails.
    """
    # Look up where the dataset's tables live in BigQuery
    tdr_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=tdr_client)
    try:
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        # Not used below, but a response without schema tables still counts as a retrieval failure
        schema_tables = dataset_info["schema"]["tables"]
        bq_access = dataset_info["access_information"]["big_query"]
        bq_project = bq_access["project_id"]
        bq_dataset = bq_access["dataset_name"]
    except Exception:
        return "Failure - Issue Retrieving Dataset Info"

    # Count the donor rows that have a non-null organism_type
    bq_client = bigquery.Client()
    count_sql = """SELECT COUNT(organism_type) AS populated_cnt
                FROM `{project}.{dataset}.anvil_donor`""".format(project=bq_project, dataset=bq_dataset)
    try:
        counts = bq_client.query(count_sql).result().to_dataframe()
        is_populated = counts["populated_cnt"].values[0] > 0
        return "Success - Field Populated" if is_populated else "Success - Field Not Populated"
    except Exception:
        return "Failure - BigQuery Error"

# Loop through datasets and check whether anvil_donor.organism_type is populated
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
# Check every dataset and collect one [dataset_id, status] row per dataset
results = []
for dataset_id in dataset_id_list:
    status = validate_organism_type(dataset_id)
    results.append([dataset_id, status])
# Build the summary frame once, after the loop: it is then always defined for this
# cell (even for an empty list) and is not rebuilt on every iteration
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Update references to md5-added files

In [None]:
# Function to collect all datarepo rows for a particular table within a dataset
def collect_all_datarepo_rows(dataset_id, table_name):
    """Return every datarepo_row_id of one table in a TDR dataset.

    Args:
        dataset_id: ID of the TDR dataset.
        table_name: Name of the table to read from the dataset's BigQuery dataset.

    Returns:
        List of the datarepo_row_id values returned by the query.

    Raises:
        Exception: if the dataset's BigQuery location cannot be retrieved from
            TDR, or if the BigQuery query fails. The error is logged first.
    """
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving BQ project and schema: {}".format(str(e)))
        # Stop here: continuing would only fail later with an unrelated
        # UnboundLocalError on bq_project, hiding the real cause
        raise Exception(e)
    client = bigquery.Client()
    query = "SELECT datarepo_row_id FROM `{project}.{schema}.{table}`".format(project = bq_project, schema = bq_schema, table = table_name)
    try:
        query_job = client.query(query)
        return [row["datarepo_row_id"] for row in query_job]
    except Exception as e:
        logging.error("Error retrieving datarepo_row_id list: {}".format(str(e)))
        raise Exception(e)

# Function to delete rows from a dataset
def delete_old_records(dataset_id, table, datarepo_row_ids):
    """Soft-delete the given rows of one table in a TDR dataset.

    Args:
        dataset_id: ID of the TDR dataset.
        table: Name of the table the rows belong to.
        datarepo_row_ids: Row IDs to delete; when empty, nothing is submitted.

    Returns:
        None.

    Raises:
        Exception: if the deletion request or the wait on its job fails.
    """
    logging.info(f"Attempting to delete original {table} records.")

    # Nothing to delete: log and leave without calling the API
    if not datarepo_row_ids:
        logging.info("No datarepo_row_ids specified for deletion.")
        return

    # Soft-delete request listing the row IDs as a JSON array
    table_spec = {
        "tableName": table,
        "jsonArraySpec": {
          "rowIds": datarepo_row_ids
        }
    }
    deletion_request = {
        "deleteType": "soft",
        "specType": "jsonArray",
        "tables": [table_spec]
    }
    tdr_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=tdr_client)
    try:
        deletion_job = datasets_api.apply_dataset_data_deletion(id=dataset_id, data_deletion_request=deletion_request)
        deletion_result, job_id = utils.wait_for_tdr_job(deletion_job)
        logging.info("Result: {}".format(deletion_result))
    except Exception as e:
        logging.info("Error: {}".format(str(e)))
        raise Exception(e)

def ingest_updated_records(profile_id, dataset_id, table, records_dict, max_attempts=1):
    """Ingest updated records into one table of a TDR dataset.

    Submits an "array"-format ingest with the "replace" update strategy and waits
    for the job to finish.

    Args:
        profile_id: Billing profile ID sent with the ingest request.
        dataset_id: ID of the TDR dataset to ingest into.
        table: Name of the target table.
        records_dict: Records sent as the "records" field of the request.
        max_attempts: Total number of ingest attempts before giving up, with a
            10 second pause between attempts. The default of 1 (no retry)
            matches the previous behavior, where the retry branch could never
            be reached.

    Returns:
        None, once an ingest attempt succeeds.

    Raises:
        Exception: wrapping the last error once all attempts have failed.
    """
    logging.info(f"Submitting ingest for updated {table} records.")

    # Build, submit, and monitor ingest request
    ingest_request = {
        "table": table,
        "profile_id": profile_id,
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "array",
        "bulkMode": False,
        "load_tag": f"File ref fields patch for {table} in {dataset_id}",
        "records": records_dict
    }
    attempt_counter = 0
    while True:
        try:
            # Refresh the client on every attempt
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest succeeded: {}".format(str(ingest_request_result)[0:1000]))
            return
        except Exception as e:
            logging.error("Error on ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < max_attempts:
                logging.info("Retrying ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                continue
            logging.error("Maximum number of retries exceeded. Logging error.")
            raise Exception(e)
                
def _split_gcs_uri(uri):
    """Split a ``gs://<bucket>/<object>`` URI into ``(bucket_name, object_name)``; raise ValueError otherwise."""
    prefix = "gs://"
    if not isinstance(uri, str) or not uri.startswith(prefix):
        raise ValueError(f"Not a gs:// URI: {uri!r}")
    # Everything up to the first "/" is the bucket; the remainder (which may itself contain "/") is the object
    bucket_name, _, object_name = uri[len(prefix):].partition("/")
    if not bucket_name or not object_name:
        raise ValueError(f"gs:// URI is missing a bucket or object name: {uri!r}")
    return bucket_name, object_name

def _decode_json_value(value):
    """Parse a JSON-encoded string into a Python object; return None and already-decoded values unchanged."""
    return json.loads(value) if isinstance(value, str) else value

def _fetch_bq_records(query):
    """Run a BigQuery query and return its rows as a list of JSON-compatible dicts (empty list if no rows)."""
    client = bigquery.Client()
    res = client.query(query).result()
    if res.total_rows > 0:
        logging.info(f"{res.total_rows} records to process.")
        df = res.to_dataframe()
        # Round-trip through JSON so values become plain JSON-compatible Python types
        return json.loads(df.to_json(orient='records'))
    logging.info("No records to process.")
    return []

def update_recs_w_file_refs(dataset_id, profile_id="e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"):
    """Re-ingest records tied to file_inventory rows lacking an md5_hash, then re-run the T pipeline.

    Steps, in order:
      1. Read the dataset schema and BigQuery location from TDR.
      2. For every table (other than file_inventory / anvil_file) with fileref columns, select the
         rows that reference a file whose file_inventory.md5_hash is NULL, rebuild each fileref value
         as a {sourcePath, targetPath} object from datarepo_load_history, ingest the rebuilt rows and
         delete the old rows.
      3. Do the same for file_inventory itself, filling md5_hash from the GCS object metadata.
      4. Delete all rows from the anvil_* tables.
      5. Re-run the T pipeline with data validation skipped.

    Args:
        dataset_id: ID of the TDR dataset to process.
        profile_id: Billing profile ID used for the ingests and the T pipeline run.

    Returns:
        "Success", or a "Failure - <stage>" string naming the stage that failed. Errors raised by
        utils.run_t_pipeline itself are not caught here.
    """
    logging.info(f"Processing md5-added files for Dataset ID = {dataset_id}")

    ## Retrieve dataset information
    logging.info("Retrieving necessary information from TDR.")
    src_schema_dict = {}
    try:
        # Client creation is inside the try so a refresh failure is reported as a status, not a crash
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["tables"] = response["schema"]["tables"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
        return "Failure - Pre-processing"

    ## Parse TDR schema to identify file reference fields
    table_dict = {}
    for table in src_schema_dict["tables"]:
        if table["name"] in ["file_inventory", "anvil_file"]:
            continue
        # NOTE(review): array_of is recorded but not used below; array-valued fileref columns are
        # compared with IN / = like scalar ones -- confirm that is intended.
        col_list = [[column["name"], column["array_of"]] for column in table["columns"] if column["datatype"] == "fileref"]
        if col_list:
            table_dict[table["name"]] = col_list

    ## Loop through tables and re-process impacted records
    for table in table_dict.keys():
        logging.info(f"Processing updates for {table}.")
        # Retrieve relevant records from BigQuery
        col_list = [col[0] for col in table_dict[table]]
        old_cols = ", ".join(col_list)
        where_clause = " OR ".join(f"t.{column_name} IN (SELECT file_ref FROM file_list)" for column_name in col_list)
        # Each fileref column gets its own load-history join (t0, t1, ...) and a rebuilt JSON value
        new_cols = "".join(
            f", CASE WHEN t{idx}.source_name IS NOT NULL THEN TO_JSON(STRUCT(t{idx}.source_name AS sourcePath, t{idx}.target_path AS targetPath)) END AS {column_name}"
            for idx, column_name in enumerate(col_list)
        )
        join_clause = "".join(
            f" LEFT JOIN load_hist t{idx} ON t.{column_name} = t{idx}.file_id"
            for idx, column_name in enumerate(col_list)
        )

        query = """WITH 
            file_list AS (SELECT * FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
            load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
            SELECT t.* EXCEPT({old_cols}){new_cols}
            FROM `{project}.{dataset}.{table}` t {joins} WHERE {where}""".format(project=bq_project, dataset=bq_dataset, table=table, old_cols=old_cols, new_cols=new_cols, joins=join_clause, where=where_clause)
        try:
            records_list = _fetch_bq_records(query)
        except Exception as e:
            logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
            return "Failure - Table Processing"
        # Ingest updated records back to TDR dataset
        try:
            datarepo_row_ids = []
            for record in records_list:
                datarepo_row_ids.append(record.pop("datarepo_row_id", None))
                for col in col_list:
                    # The LEFT JOIN / CASE yields NULL when a column has no matching load-history row;
                    # leave those as None instead of failing the whole table on json.loads(None)
                    record[col] = _decode_json_value(record[col])
            if records_list:
                # Ingest first so the old rows are only removed once their replacements are in
                ingest_updated_records(profile_id, dataset_id, table, records_list)
                delete_old_records(dataset_id, table, datarepo_row_ids)
        except Exception as e:
            logging.error(f"Error replacing TDR records: {str(e)}")
            return "Failure - Table Processing"
        
    ## Re-process file_inventory
    logging.info("Processing updates for file_inventory.")
    # Retrieve relevant records from BigQuery
    # The original file_ref is excluded from t1.* so the result has a single file_ref column;
    # duplicate column names cannot be serialized with to_json(orient='records')
    query = """WITH 
        file_list AS (SELECT file_ref FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
        load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
        SELECT t1.* EXCEPT(file_ref), CASE WHEN t2.source_name IS NOT NULL THEN TO_JSON(STRUCT(t2.source_name AS sourcePath, t2.target_path AS targetPath)) END AS file_ref
        FROM `{project}.{dataset}.file_inventory` t1
          INNER JOIN load_hist t2 ON t1.file_ref = t2.file_id
        WHERE t1.file_ref IN (SELECT file_ref FROM file_list)""".format(project=bq_project, dataset=bq_dataset)
    try:
        records_list = _fetch_bq_records(query)
    except Exception as e:
        logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
        return "Failure - File Inventory Processing"
    # Loop through records and update md5_hash from GCS metadata
    try:
        storage_client = storage.Client()
        datarepo_row_ids = []
        for record in records_list:
            bucket_name, object_name = _split_gcs_uri(record["uri"])
            bucket = storage_client.bucket(bucket_name, user_project="anvil-datastorage")
            blob = bucket.get_blob(object_name)
            if blob is None:
                # get_blob returned nothing for this object; fail with a message that names it
                raise FileNotFoundError(f"Object not found in GCS: {record['uri']}")
            record["md5_hash"] = blob.md5_hash
            datarepo_row_ids.append(record.pop("datarepo_row_id", None))
    except Exception as e:
        logging.error(f"Error retrieving file metadata from GCS: {str(e)}")
        return "Failure - File Inventory Processing"
    # Ingest updated records back to TDR dataset
    try:
        if records_list:
            for record in records_list:
                # Same handling as the fileref columns of the other tables
                record["file_ref"] = _decode_json_value(record["file_ref"])
            ingest_updated_records(profile_id, dataset_id, "file_inventory", records_list)
            delete_old_records(dataset_id, "file_inventory", datarepo_row_ids)
    except Exception as e:
        logging.error(f"Error replacing TDR records: {str(e)}")
        return "Failure - File Inventory Processing"

    ## Empty anvil_% tables
    logging.info("Clearing out existing anvil_% tables")
    table_list = ["anvil_activity", "anvil_alignmentactivity", "anvil_antibody", "anvil_assayactivity", "anvil_biosample", "anvil_diagnosis", "anvil_donor", "anvil_file", "anvil_sequencingactivity", "anvil_variantcallingactivity"]
    for table in table_list:
        try:
            datarepo_row_ids = collect_all_datarepo_rows(dataset_id, table)
            if datarepo_row_ids:
                delete_old_records(dataset_id, table, datarepo_row_ids)
        except Exception as e:
            logging.error(f"Error clearing out existing anvil_% records: {str(e)}")
            return "Failure - anvil_% Record Deletion"
    
    ## Re-run T pipeline without validation
    params = {}
    params["ws_name"] = ws_name
    params["ws_project"] = ws_project
    params["ws_bucket"] = ws_bucket
    params["ws_bucket_name"] = ws_bucket_name
    params["profile_id"] = profile_id
    params["mapping_target"] = "anvil"
    params["skip_transforms"] = False
    params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names 
    params["skip_schema_extension"] = False
    params["skip_ingests"] = False
    params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
    params["skip_file_relation_inference"] = False
    params["skip_dangling_fk_resolution"] = False
    params["skip_supplementary_file_identification"] = False
    params["skip_snapshot_creation"] = False
    params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
    params["skip_data_validation"] = True
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        phs_id = dataset_info["phs_id"]
        consent_name = dataset_info["properties"]["consent_name"]
        auth_domains = dataset_info["properties"]["auth_domains"]
        # Not passed to the pipeline; lookup retained so a missing property still fails here as before
        src_workspaces = dataset_info["properties"]["source_workspaces"]
    except Exception as e:
        logging.error(f"Error retrieving dataset details for T pipeline: {str(e)}")
        return "Failure - Dataset Retrieval for T Pipeline"
    if dataset_name:
        params["dataset_id"] = dataset_id
        params["dataset_name"] = dataset_name
        params["phs_id"] = phs_id
        params["consent_name"] = consent_name
        params["auth_domains"] = auth_domains
        utils.run_t_pipeline(params)
    
    # Return success message if no failures recorded
    logging.info("Function completed successfully.")
    return "Success"

# Loop through datasets and process md5 updates
dataset_id_list = [
'700303c2-fcef-48a5-9900-096bf34e2d83',
'a715c70d-da92-43ee-a851-1a27277909a2',
]
results = []
for dataset_id in dataset_id_list:
    status = update_recs_w_file_refs(dataset_id) 
    results.append([dataset_id, status])
# Build the summary once, after the loop, so it also exists (empty) when no datasets are listed
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status"])
display(results_df)


In [None]:
# # Testing
# dataset_id = 'bc6075ac-5cfe-4613-8601-36ceb614939e'

# logging.info(f"Processing md5-added files for Dataset ID = {dataset_id}")

# ## Retrieve dataset information
# logging.info("Retrieving necessary information from TDR.")
# src_schema_dict = {}
# api_client = utils.refresh_tdr_api_client()
# datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
# try:
#     response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
#     src_schema_dict["tables"] = response["schema"]["tables"]
#     bq_project = response["access_information"]["big_query"]["project_id"]
#     bq_dataset = response["access_information"]["big_query"]["dataset_name"]
# except Exception as e:
#     logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
#     #return "Failure - Pre-processing"

# ## Parse TDR schema to identify file reference fields
# table_dict = {}
# for table in src_schema_dict["tables"]:
#     if table["name"] in ["file_inventory", "anvil_file"]:
#         continue
#     else:
#         col_list = []
#         for column in table["columns"]:
#             if column["datatype"] == "fileref":
#                 col_list.append([column["name"], column["array_of"]])
#         if col_list:
#             table_dict[table["name"]] = col_list

# ## Loop through tables and re-process impacted records
# for table in table_dict.keys():
#     logging.info(f"Processing updates for {table}.")
#     # Retrieve relevant records from BigQuery
#     col_list = []
#     old_cols = ""
#     new_cols = ""
#     join_clause = ""
#     where_clause = ""
#     for idx, col in enumerate(table_dict[table]):
#         column_name = col[0]
#         col_list.append(column_name)
#         if idx == 0: 
#             old_cols += column_name
#             where_clause += f"t.{column_name} IN (SELECT file_ref FROM file_list)"
#         else:
#             old_cols += ", " + column_name
#             where_clause += f" OR t.{column_name} IN (SELECT file_ref FROM file_list)"
#         new_cols += f", CASE WHEN t{idx}.source_name IS NOT NULL THEN TO_JSON(STRUCT(t{idx}.source_name AS sourcePath, t{idx}.target_path AS targetPath)) END AS {column_name}"
#         join_clause += f" LEFT JOIN load_hist t{idx} ON t.{column_name} = t{idx}.file_id"

#     query = """WITH 
#         file_list AS (SELECT * FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
#         load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
#         SELECT t.* EXCEPT({old_cols}){new_cols}
#         FROM `{project}.{dataset}.{table}` t {joins} WHERE {where}""".format(project=bq_project, dataset=bq_dataset, table=table, old_cols=old_cols, new_cols=new_cols, joins=join_clause, where=where_clause)
#     try:
#         client = bigquery.Client()
#         res = client.query(query).result()
#         if res.total_rows > 0:
#             logging.info(f"{res.total_rows} records to process.")
#             df = res.to_dataframe()
#             records_json = df.to_json(orient='records')
#             records_list = json.loads(records_json)
#         else:
#             logging.info("No records to process.")
#             records_list = []
#     except Exception as e:
#         logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
#         break
#         #return "Failure - Table Processing"
#     # Ingest updated records back to TDR dataset
#     try:
#         datarepo_row_ids = []
#         for record in records_list:
#             datarepo_row_ids.append(record.pop("datarepo_row_id", None))
#             for col in col_list:
#                 record[col] = json.loads(record[col])
#         if records_list:
#             ingest_updated_records("e0e03e48-5b96-45ec-baa4-8cc1ebf74c61", dataset_id, table, records_list)
#             delete_old_records(dataset_id, table, datarepo_row_ids)
#     except Exception as e:
#         logging.error(f"Error replacing TDR records: {str(e)}")
#         break
#         #return "Failure - Table Processing"

# # ## Re-process file_inventory
# # logging.info(f"Processing updates for file_inventory.")
# # # Retrieve relevant records from BigQuery
# # query = """WITH 
# #     file_list AS (SELECT file_ref FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
# #     load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
# #     SELECT t1.*, CASE WHEN t2.source_name IS NOT NULL THEN TO_JSON(STRUCT(t2.source_name AS sourcePath, t2.target_path AS targetPath)) END AS file_ref
# #     FROM `{project}.{dataset}.file_inventory` t1
# #       INNER JOIN load_hist t2 ON t1.file_ref = t2.file_id
# #     WHERE file_ref IN (SELECT file_ref FROM file_list)""".format(project=bq_project, dataset=bq_dataset)
# # try:
# #     client = bigquery.Client()
# #     res = client.query(query).result()
# #     if res.total_rows > 0:
# #         logging.info(f"{res.total_rows} records to process.")
# #         df = res.to_dataframe()
# #         records_json = df.to_json(orient='records')
# #         records_list = json.loads(records_json)
# #     else:
# #         logging.info("No records to process.")
# #         records_list = []
# # except Exception as e:
# #     logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
# #     #return "Failure - File Inventory Processing"
# # # Loop through records and update md5_hash from GCS metadata
# # try:
# #     storage_client = storage.Client()
# #     datarepo_row_ids = []
# #     for record in records_list:
# #         bucket = re.match('gs:\/\/([a-z0-9\-]+)', record["uri"]).group(1)
# #         obj = re.match('gs:\/\/[a-z0-9\-]+\/([A-Za-z0-9\-_\/\.]+)', record["uri"]).group(1)
# #         bucket = storage_client.bucket(bucket, user_project="anvil-datastorage")
# #         blob = bucket.get_blob(obj)
# #         record["md5_hash"] = blob.md5_hash
# #         datarepo_row_ids.append(record.pop("datarepo_row_id", None))
# # except Exception as e:
# #     logging.error(f"Error retrieving file metadata from GCS: {str(e)}")
# #     #return "Failure - File Inventory Processing"
# # # Ingest updated records back to TDR dataset
# # try:
# #     if records_list:
# #         ingest_updated_records("e0e03e48-5b96-45ec-baa4-8cc1ebf74c61", dataset_id, "file_inventory", records_list)
# #         delete_old_records(dataset_id, "file_inventory", datarepo_row_ids)         
# # except Exception as e:
# #     logging.error(f"Error replacing TDR records: {str(e)}")
# #     #return "Failure - File Inventory Processing"

# # ## Empty anvil_% tables
# # logging.info("Clearing out existing anvil_% tables")
# # table_list = ["anvil_activity", "anvil_alignmentactivity", "anvil_antibody", "anvil_assayactivity", "anvil_biosample", "anvil_diagnosis", "anvil_donor", "anvil_file", "anvil_sequencingactivity", "anvil_variantcallingactivity"]
# # for table in table_list:
# #     try:
# #         datarepo_row_ids = collect_all_datarepo_rows(dataset_id, table)
# #         if datarepo_row_ids:
# #             delete_old_records(dataset_id, table, datarepo_row_ids)
# #     except Exception as e:
# #         logging.error(f"Error clearing out existing anvil_% records: {str(e)}")
# #         break
# #         #return "Failure - anvil_% Record Deletion"

# # ## Re-run T pipeline without validation
# # params = {}
# # params["ws_name"] = ws_name
# # params["ws_project"] = ws_project
# # params["ws_bucket"] = ws_bucket
# # params["ws_bucket_name"] = ws_bucket_name
# # params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61" 
# # params["mapping_target"] = "anvil"
# # params["skip_transforms"] = False
# # params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names 
# # params["skip_schema_extension"] = False
# # params["skip_ingests"] = False
# # params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
# # params["skip_file_relation_inference"] = False
# # params["skip_dangling_fk_resolution"] = False
# # params["skip_supplementary_file_identification"] = False
# # params["skip_snapshot_creation"] = False
# # params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
# # params["skip_data_validation"] = True
# # try:
# #     api_client = utils.refresh_tdr_api_client()
# #     datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
# #     dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
# #     dataset_name = dataset_info["name"]
# #     phs_id = dataset_info["phs_id"]
# #     consent_name = dataset_info["properties"]["consent_name"]
# #     auth_domains = dataset_info["properties"]["auth_domains"]
# #     src_workspaces = dataset_info["properties"]["source_workspaces"]
# # except:
# #     dataset_name = ""
# #     return "Failure - Dataset Retrieval for T Pipeline"
# # if dataset_name:
# #     params["dataset_id"] = dataset_id
# #     params["dataset_name"] = dataset_name
# #     params["phs_id"] = phs_id
# #     params["consent_name"] = consent_name
# #     params["auth_domains"] = auth_domains
# #     utils.run_t_pipeline(params)

# # Return success message if no failures recorded
# logging.info("Function completed successfully.")
# #return "Success"


In [None]:
# for idx, record in enumerate(records_list):
#     if record["library_2_estimated_library_size"]:
#         print(str(idx) + " - " + str(record["library_2_estimated_library_size"]))

In [None]:
# records_list[50]

# Add new workspace files to the appropriate TDR dataset

## Script to diff file inventories between TDR and source workspaces

In [2]:
#############################################
## Functions
#############################################

def anvil_tdr_file_diff(dataset_id_list, print_queries, write_out_files):

    # Loop through and process datasets
    results = []
    df_detailed_results = pd.DataFrame()
    for dataset_id in dataset_id_list:

        # Retrieve dataset information
        logging.info(f"Processing dataset_id = {dataset_id}...")
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        try:
            logging.info("Retrieving dataset details.")
            dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            dataset_name = dataset_details["name"]
            bq_project = dataset_details["access_information"]["big_query"]["project_id"]
            bq_dataset = dataset_details["access_information"]["big_query"]["dataset_name"]
            try:
                source_workspace = dataset_details["properties"]["source_workspaces"][0]
            except:
                source_workspace = ""
        except Exception as e:
            error_message = f"Error retrieving dataset details: {str(e)}"
            logging.error(error_message)
            results.append([dataset_id, "", 0, 0, "Error", error_message])

        # Build and execute aggregate query
        logging.info("Building and executing aggregate file diff query.")
        client = bigquery.Client()
        query = f"""WITH files_all_workspaces
                    AS
                    (
                      SELECT file_path
                      FROM `broad-dsde-prod-analytics-dev.anvil_inventory_uscentral1.ref_object_inventory_20240405`
                      WHERE file_path NOT LIKE '%/'
                    ),
                    files_source_workspace
                    AS
                    (
                      SELECT file_path
                      FROM `broad-dsde-prod-analytics-dev.anvil_inventory_uscentral1.ref_object_inventory_20240405`
                      WHERE workspace_name = '{source_workspace}'
                      AND billing_project = 'anvil-datastorage'
                      AND file_path NOT LIKE '%/'
                    ),
                    files_tdr
                    AS
                    (
                      SELECT uri AS file_path
                      FROM `{bq_project}.{bq_dataset}.file_inventory`  
                    )
                    SELECT COUNT(CASE WHEN t0.file_path IS NOT NULL THEN 1 END) AS tdr_files,
                    COUNT(CASE WHEN t0.file_path IS NOT NULL AND t1.file_path IS NULL THEN 1 END) AS tdr_files_not_in_inv,
                    COUNT(CASE WHEN t0.file_path IS NULL AND t2.file_path IS NOT NULL THEN 1 END) AS inv_files_not_in_tdr,
                    COUNT(CASE WHEN t0.file_path IS NULL AND t2.file_path IS NOT NULL AND t2.file_path NOT LIKE '%SubsetHailJointCall%' AND t2.file_path NOT LIKE '%.vds/%' THEN 1 END) AS non_vds_inv_files_not_in_tdr,
                    COUNT(CASE WHEN t2.file_path IS NOT NULL AND t2.file_path LIKE '%SubsetHailJointCall%' AND t2.file_path LIKE '%.vcf%' THEN 1 END) AS joint_call_vcf_files
                    FROM files_tdr t0 
                      LEFT JOIN files_all_workspaces t1 ON t0.file_path = t1.file_path  
                      FULL JOIN files_source_workspace t2 ON t0.file_path = t2.file_path"""
        if print_queries:
            print("Aggregate file diff query:")
            print(query)
        try:
            df = client.query(query).result().to_dataframe()
            tdr_files = df["tdr_files"].values[0]
            tdr_files_not_in_inv = df["tdr_files_not_in_inv"].values[0]
            inv_files_not_in_tdr = df["inv_files_not_in_tdr"].values[0] 
            non_vds_inv_files_not_in_tdr = df["non_vds_inv_files_not_in_tdr"].values[0]
            joint_call_vcf_files = df["joint_call_vcf_files"].values[0]
            results.append([dataset_id, dataset_name, tdr_files, tdr_files_not_in_inv, inv_files_not_in_tdr, non_vds_inv_files_not_in_tdr, joint_call_vcf_files, "Success", ""])
        except Exception as e:
            error_message = f"BigQuery error: {str(e)}"
            logging.error(error_message)
            results.append([dataset_id, dataset_name, 0, 0, 0, 0, 0, "Error", error_message]) 
        
        # Build and execute details query
        if write_out_files:
            logging.info("Building and executing detailed file diff query.")
            client = bigquery.Client()
            query = f"""WITH files_all_workspaces
                        AS
                        (
                          SELECT file_path
                          FROM `broad-dsde-prod-analytics-dev.anvil_inventory_uscentral1.ref_object_inventory_20240405`
                          WHERE file_path NOT LIKE '%/'
                        ),
                        files_source_workspace
                        AS
                        (
                          SELECT file_path
                          FROM `broad-dsde-prod-analytics-dev.anvil_inventory_uscentral1.ref_object_inventory_20240405`
                          WHERE workspace_name = '{source_workspace}'
                          AND billing_project = 'anvil-datastorage'
                          AND file_path NOT LIKE '%/'
                        ),
                        files_tdr
                        AS
                        (
                          SELECT uri AS file_path
                          FROM `{bq_project}.{bq_dataset}.file_inventory`  
                        )
                        SELECT '{dataset_id}' AS dataset_id, 'tdr_files_not_in_inv' AS metric, t0.file_path
                        FROM files_tdr t0 
                          LEFT JOIN files_all_workspaces t1 ON t0.file_path = t1.file_path  
                          FULL JOIN files_source_workspace t2 ON t0.file_path = t2.file_path
                        WHERE t0.file_path IS NOT NULL AND t1.file_path IS NULL
                        UNION ALL
                        SELECT '{dataset_id}' AS dataset_id, 'non_vds_inv_files_not_in_tdr' AS metric, t2.file_path
                        FROM files_tdr t0 
                          LEFT JOIN files_all_workspaces t1 ON t0.file_path = t1.file_path  
                          FULL JOIN files_source_workspace t2 ON t0.file_path = t2.file_path
                        WHERE t0.file_path IS NULL AND t2.file_path IS NOT NULL AND t2.file_path NOT LIKE '%SubsetHailJointCall%' AND t2.file_path NOT LIKE '%.vds/%'"""
            if print_queries:
                print("Detailed file diff query:")
                print(query)
            try:
                df = client.query(query).result().to_dataframe()
                df_detailed_results = pd.concat([df_detailed_results, df])
            except Exception as e:
                error_message = f"BigQuery error: {str(e)}"
                logging.error(error_message)

    # Write out detailed results, if specified
    if write_out_files:
        destination_dir = "ingest_pipeline/resources/file_inventory_diff/details"
        current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        output_file = f"file_diffs_{current_datetime_string}.tsv"
        logging.info(f"Writing out detailed file diff results out to {ws_bucket}/{destination_dir}/{output_file}")
        df_detailed_results.to_csv(output_file, index=False, sep="\t")
        !gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
        !rm $output_file
    
    # Display results
    print("Aggregate Results:")
    results_df = pd.DataFrame(results, columns = ["dataset_id", "dataset_name", "tdr_files", "tdr_files_not_in_inv", "inv_files_not_in_tdr", "non_vds_inv_files_not_in_tdr", "joint_call_vcf_files", "status", "message"])
    display(results_df)
        
#############################################
## Input Parameters
#############################################

# TDR datasets whose file inventory should be diffed against the source workspace
dataset_id_list = [
    "d48adc59-8934-41bb-9720-63e71f1933be",
    "80baf71d-28d0-4bca-81b7-49ddfadfa7a3",
]

# Set to True to print each generated query before it is run
print_queries = False

# Set to True to write per-file details out in addition to the aggregate table
write_out_files = False

#############################################
## Execution
#############################################

anvil_tdr_file_diff(
    dataset_id_list=dataset_id_list,
    print_queries=print_queries,
    write_out_files=write_out_files,
)


04/10/2024 02:46:15 PM - INFO: Processing dataset_id = d48adc59-8934-41bb-9720-63e71f1933be...
04/10/2024 02:46:15 PM - INFO: Retrieving dataset details.
04/10/2024 02:46:15 PM - INFO: Building and executing aggregate file diff query.
04/10/2024 02:46:20 PM - INFO: Processing dataset_id = 80baf71d-28d0-4bca-81b7-49ddfadfa7a3...
04/10/2024 02:46:20 PM - INFO: Retrieving dataset details.
04/10/2024 02:46:20 PM - INFO: Building and executing aggregate file diff query.
Aggregate Results:


Unnamed: 0,dataset_id,dataset_name,tdr_files,tdr_files_not_in_inv,inv_files_not_in_tdr,non_vds_inv_files_not_in_tdr,joint_call_vcf_files,status,message
0,d48adc59-8934-41bb-9720-63e71f1933be,ANVIL_1000G_PRIMED_data_model_20240410,11493,23,0,0,0,Success,
1,80baf71d-28d0-4bca-81b7-49ddfadfa7a3,ANVIL_GTEx_v10_hg38_20240410,153984,28135,0,0,0,Success,


## Script to identify specific files that haven't been ingested into TDR

In [None]:
#############################################
## Functions
#############################################

def identify_additional_files(dataset_id_list, file_exclusions, output_dir):
    
    # Loop through and process dataset_ids
    logging.info("Starting identify_additional_files function...")
    agg_results = []
    for dataset_id in dataset_id_list:
        result = [dataset_id, "Failure", 0, 0, 0]
        try:
            # Retrieve dataset details
            logging.info(f"Processing dataset_id {dataset_id}...")
            logging.info("Retrieving dataset details.")
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            try:
                source_workspaces = dataset_details["properties"]["source_workspaces"]
                bq_project = dataset_details["access_information"]["big_query"]["project_id"]
                bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
            except Exception as e:
                print("Failure - Issue Retrieving Dataset Info") 
                continue

            # Use source workspace(s) to find workspace bucket(s) to look for new files
            logging.info("Determining source workspace bucket(s).")
            data_files_src_buckets = {}
            for ws in source_workspaces:
                try:
                    ws_attributes = utils.get_workspace_attributes("anvil-datastorage", ws)
                    src_bucket = ws_attributes["bucketName"] if ws_attributes.get("bucketName") else ""
                    if not src_bucket:
                        print("Failure - Issue Retrieving Source Buckets")
                        continue
                    elif src_bucket not in data_files_src_buckets:
                        data_files_src_buckets[src_bucket] = {
                            "include_dirs": [],
                            "exclude_dirs": []
                        }
                except Exception as e:
                    print("Failure - Issue Retrieving Source Buckets")
                    continue

            # Pull existing file inventory from BigQuery
            logging.info("Pulling existing file inventory records.")
            client = bigquery.Client()
            query = """SELECT uri FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
            file_list = []
            try:
                output = client.query(query).result()
                if output.total_rows > 0:
                    for row in output:
                        file_list.append(row.uri)
            except Exception as e:
                print("Failure - Issue Retrieving Existing File Inventory Records")
                continue

            # Build file inventory from workspace bucket(s)
            logging.info("Building new file inventory.")
            ws_attributes = utils.get_workspace_attributes(ws_project, ws_name)
            params = {}
            params["data_files_src_buckets"] = data_files_src_buckets
            params["google_project"] = ws_attributes["googleProject"]
            params["file_inventory_dir"] = output_dir
            params["global_file_exclusions"] = []
            inventory, retry_count = bfi.build_inventory(params)

            # Diff files to ingest and collect summary stats
            logging.info("Diffing new and existing file inventory records.")
            full_diff_list = []
            exclude_list = []
            include_list = []
            for file in inventory:
                file_excluded = False
                if file["uri"] not in file_list:
                    full_diff_list.append(file)
                    for exclude_term in file_exclusions:
                        if exclude_term in file["uri"]:
                            exclude_list.append(file)
                            file_excluded = True
                            break
                    if not file_excluded:
                        include_list.append(file)
            new_file_cnt = len(full_diff_list)
            new_exclusion_file_cnt = len(exclude_list)
            new_non_exclusion_file_cnt = len(include_list)
            result = [dataset_id, "Success", new_file_cnt, new_exclusion_file_cnt, new_non_exclusion_file_cnt]

            # Record diff files and write out to tsv 
            if len(include_list) > 0:
                logging.info("Writing out inclusion results.")
                df_inventory = pd.DataFrame(include_list)
                destination_dir = "ingest_pipeline/resources/file_inventory_diff/output"
                output_file = f"file_inventory_{dataset_id}.tsv"
                logging.info(f"Writing inclusion results out to {ws_bucket}/{destination_dir}/{output_file}")
                df_inventory.to_csv(output_file, index=False, sep="\t")
                !gsutil cp $output_file $ws_bucket/$destination_dir/ 2> stdout
                !rm $output_file
            else:
                logging.info("No inclusion results to write out.")
        except:
            logging.info("Unspecified error.")
            
        # Write out agg_results
        agg_results.append(result)
        with open("file_diff_out.csv", "a", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(result)

    # Display results
    logging.info("Function 'identify_additional_files' finished successfully.")
    logging.info("\nResults:")
    df_agg_results = pd.DataFrame(agg_results, columns =["Dataset ID", "Status", "New Files", "New Files (Exclusion List)", "New Files (Non-Exclusion List)"])
    display(df_agg_results)

#############################################
## Input Parameters
#############################################

# List of dataset IDs to analyze
# NOTE: dataset_id_list is a notebook-level name that other cells also assign, so re-run
# this cell before re-running the execution line below.
dataset_id_list = [
    '8da05494-fe7a-4af5-b257-bada143ee426',
    '8e88cabc-e713-44ed-a5d2-41935c3b4eb5',
    'be8cfc23-cd19-46fb-92e1-a77ac380d7aa',
    'f9224ea2-dd31-421d-80d4-f35082ef8d68',
    '487016d8-ea02-4b20-a45f-7382139aa865',
    'eb35085f-0cbf-4829-a3ad-acaa53a250b5',
    '7577f264-8e84-440d-9346-7c4d5affda51',
    'febd8561-4769-4f3b-b7c0-ae7ff6ede2e9',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    '166746e8-ce26-4fa1-a587-443ca9fc59a1',
    '49a97523-0a7a-4d5a-ae20-496f86de2032',
    '583023a1-aa12-40e2-a964-8ad50ad400ba',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
]

# List of file exclusions to apply
# Each entry is matched as a plain substring of the file URI (not a glob or regex)
file_exclusions = ["SubsetHailJointCall", ".vds/"]

# Output directory
# Handed to bfi.build_inventory as "file_inventory_dir"; the destination of the per-dataset
# TSV upload is set separately inside identify_additional_files.
output_dir = "ingest_pipeline/resources/file_inventory_diff/output"

#############################################
## Execution
#############################################

identify_additional_files(dataset_id_list, file_exclusions, output_dir)


## Script to ingest missing workspace files into the appropriate TDR dataset

In [None]:
#############################################
## Functions
#############################################

def ingest_additional_files(dataset_id_list, file_inventory_dir):
    """Ingest previously identified missing files into each dataset's file_inventory table.

    For every dataset ID, the function reads file_inventory_<dataset_id>.tsv from
    file_inventory_dir in the workspace bucket and submits its rows as an array-format ingest
    into the dataset's "file_inventory" table, then waits for the job to finish.

    Args:
        dataset_id_list: Iterable of TDR dataset IDs to process.
        file_inventory_dir: Directory (within the workspace bucket) holding the TSV files.

    Returns:
        None. Exactly one result row (dataset ID, status, message) is recorded per dataset
        and the combined table is displayed.
    """

    # Loop through and process datasets
    results = []
    for dataset_id in dataset_id_list:

        # Retrieve dataset details
        logging.info(f"Processing dataset_id {dataset_id}...")
        try:
            logging.info("Retrieving dataset details.")
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        except Exception as e:
            error_message = "Issue Retrieving Dataset Info"
            logging.info(error_message) 
            results.append([dataset_id, "Failure", error_message])
            continue

        # The first source workspace is needed for the load tag. A missing, null, or empty
        # value must stop processing of this dataset; otherwise the value left over from
        # the previous dataset would be reused and a second result row recorded.
        try:
            source_workspaces = dataset_details["properties"]["source_workspaces"]
        except Exception:
            source_workspaces = None
        if not source_workspaces:
            error_message = "No source workspace found on dataset."
            logging.info(error_message) 
            results.append([dataset_id, "Failure", error_message])
            continue

        # Read in file inventory
        logging.info("Reading in file inventory, if exists.")
        try: 
            file_inventory_name = f"file_inventory_{dataset_id}.tsv"
            inventory_file_path = "gs://" + ws_bucket_name + "/" + file_inventory_dir + "/" + file_inventory_name
            df_inv = pd.read_csv(inventory_file_path, delimiter = "\t")
            # file_ref was written to the TSV as a Python-style dict string; swap the quote
            # characters so it can be parsed back as JSON
            df_inv["file_ref"] = df_inv.apply(lambda x: json.loads(x["file_ref"].replace("\'", "\"")), axis=1)
            df_inv = df_inv.replace(np.nan, None)
            file_inventory = df_inv.to_dict(orient='records')
            logging.info("File inventory populated successfully.")
        except Exception as e:
            error_message = "File inventory not populated. Unable to populate from file: {}".format(e)
            logging.info(error_message)
            results.append([dataset_id, "Failure", error_message])
            continue

        # Build, submit, and monitor ingest request
        logging.info("Building and submitting ingest request.")
        ingest_request = {
            "table": "file_inventory",
            # NOTE(review): billing profile ID is hard-coded -- confirm it applies to every dataset
            "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
            "ignore_unknown_values": True,
            "resolve_existing_files": True,
            "updateStrategy": "replace",
            "format": "array",
            "bulkMode": True,
            "load_tag": f"Ingest for {source_workspaces[0]}",
            "records": file_inventory
        }
        attempt_counter = 0
        while True:
            try:
                api_client = utils.refresh_tdr_api_client()
                datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
                ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
                logging.info("Ingest succeeded: {}".format(str(ingest_request_result)[0:1000]))
                results.append([dataset_id, "Success", None])
                break
            except Exception as e:
                logging.error("Error on ingest: {}".format(str(e)))
                attempt_counter += 1
                # NOTE(review): the counter is already 1 here, so `< 1` is never true and a
                # failed ingest is not retried. Raise the threshold only if re-submitting
                # the ingest is known to be safe.
                if attempt_counter < 1:
                    logging.info("Retrying ingest (attempt #{})...".format(str(attempt_counter)))
                    sleep(10)
                    continue
                else:
                    logging.error("Maximum number of retries exceeded. Logging error.")
                    results.append([dataset_id, "Failure", str(e)])
                    break

    # Display results
    logging.info("\nResults:")
    df_results = pd.DataFrame(results, columns =["Dataset ID", "Status", "Message"])
    display(df_results)

#############################################
## Input Parameters
#############################################

# List of dataset IDs to analyze
# NOTE: dataset_id_list is a notebook-level name that other cells also assign, so re-run
# this cell before re-running the execution line below.
dataset_id_list = [
    '1c2fe11d-b020-4c54-8c71-1ea91623d626',
    '902596ce-714e-49b3-8271-f3dfece52309',
    '18b1a7a4-1724-4e10-95ca-fa35164c4801',
    '63b229b5-e7c8-4fd3-bbc8-ecf344da70d4',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    '9737abab-2d09-4912-b300-f32553bda82c',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    'b5d7c34a-c383-4fc7-aa4d-b6dc941cd41a',
    'bcfe7f3b-3e63-45de-9e4d-144f9fc63753',
]

# File inventory directory
# Path inside the workspace bucket; ingest_additional_files reads
# file_inventory_<dataset_id>.tsv from this directory for each dataset.
file_inventory_dir = "ingest_pipeline/resources/file_inventory_diff/output"


#############################################
## Execution
#############################################

ingest_additional_files(dataset_id_list, file_inventory_dir)


## Script to soft-delete tabular data records for files deleted at the source

In [None]:
#############################################
## Functions
#############################################

# Function to delete rows from a dataset
def delete_datarepo_rows(dataset_id, table_name, datarepo_row_ids):
    """Soft-delete the given row IDs from one table of a TDR dataset.

    Args:
        dataset_id: ID of the dataset to delete from.
        table_name: Name of the table holding the rows.
        datarepo_row_ids: Row IDs to delete; an empty or None value is a no-op.

    Returns:
        "Success" when the deletion job completed or there was nothing to delete,
        "Failure" when submitting or waiting on the job raised an exception.
    """
    logging.info("Attempting to delete specified rows from {} for dataset {}".format(table_name, dataset_id))

    # Guard clause: nothing to delete counts as success
    if not datarepo_row_ids:
        logging.info("No datarepo_row_ids specified for deletion.")
        return "Success"

    table_spec = {
        "tableName": table_name,
        "jsonArraySpec": {"rowIds": datarepo_row_ids},
    }
    deletion_request = {
        "deleteType": "soft",
        "specType": "jsonArray",
        "tables": [table_spec],
    }

    try:
        tdr_client = utils.refresh_tdr_api_client()
        tdr_datasets = data_repo_client.DatasetsApi(api_client=tdr_client)
        submitted_job = tdr_datasets.apply_dataset_data_deletion(id=dataset_id, data_deletion_request=deletion_request)
        outcome, _job_id = utils.wait_for_tdr_job(submitted_job)
        logging.info("Result: {}".format(outcome))
    except Exception as err:
        # Errors are reported through the return value rather than raised to the caller
        logging.info("Error: {}".format(str(err)))
        return "Failure"
    return "Success"

# Helper to page through every record of a table that matches a filter
def _fetch_all_matching_records(datasets_api, dataset_id, table, filter_string, max_page_size=1000, max_retries=2, retry_delay=10):
    """Return all records of `table` matching `filter_string`, fetched page by page.

    Each page request is retried up to `max_retries` times, sleeping `retry_delay` seconds
    between attempts.

    Raises:
        RuntimeError: when a page still cannot be retrieved after the retries, so callers
            never continue with missing or stale results.
    """
    records = []
    while True:
        payload = {
            "offset": len(records),
            "limit": max_page_size,
            "sort": "datarepo_row_id",
            "direction": "asc",
            "filter": filter_string
        }
        attempt_counter = 0
        while True:
            try:
                record_results = datasets_api.query_dataset_data_by_id(id=dataset_id, table=table, query_data_request_model=payload).to_dict()
                break
            except Exception as e:
                if attempt_counter < max_retries:
                    sleep(retry_delay)
                    attempt_counter += 1
                else:
                    raise RuntimeError(f"Error retrieving records from table '{table}': {e}") from e
        page = record_results.get("result")
        if not page:
            break
        records.extend(page)
        if len(records) >= record_results["filtered_row_count"]:
            break
    return records

# Helper to test whether a fileref cell (single value or list of values) points at a bad file
def _references_bad_file(value, bad_file_refs):
    """Return True when `value`, or any item of a list-valued `value`, is in `bad_file_refs`."""
    if isinstance(value, (list, tuple)):
        return any(item in bad_file_refs for item in value)
    return value in bad_file_refs

# Function to evaluate and potentially remove deleted files
def remove_deleted_files(dataset_id, file_uri_list):
    """Soft-delete file_inventory rows for the given URIs unless other tables still reference them.

    The file_inventory rows whose uri is in `file_uri_list` are looked up, then every other
    non-"anvil_" table with fileref columns is searched for references to those files. If
    references exist they are displayed and nothing is deleted; otherwise the file_inventory
    rows are soft-deleted via delete_datarepo_rows.

    Any failure to retrieve dataset details or records aborts the run without deleting, so a
    deletion is never based on incomplete information.

    Args:
        dataset_id: ID of the TDR dataset to update.
        file_uri_list: URIs of files to remove from the dataset.

    Returns:
        None.
    """
    logging.info(f"Processing dataset_id = {dataset_id}...")
    if not file_uri_list:
        logging.info("No file URIs specified for removal.")
        return

    # Retrieve dataset information
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    fileref_col_dict = {}
    key_col_dict = {}
    try:
        logging.info("Retrieving dataset details.")
        dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES", "SCHEMA"]).to_dict()
        for table_entry in dataset_details["schema"]["tables"]:
            if table_entry["name"] != "file_inventory" and "anvil_" not in table_entry["name"]:
                fileref_list = []
                for idx, column_entry in enumerate(table_entry["columns"]):
                    # The first column of each table is used as its key column for reporting
                    if idx == 0:
                        key_col_dict[table_entry["name"]] = column_entry["name"]
                    if column_entry["datatype"] == "fileref":
                        fileref_list.append(column_entry["name"])
                if fileref_list:
                    fileref_col_dict[table_entry["name"]] = fileref_list
    except Exception as e:
        logging.error(f"Error retrieving dataset details: {str(e)}")
        return

    # Retrieving file_inventory records
    logging.info("Fetching file_inventory records associated with the files to remove.")
    filter_string = "uri in ('" + "', '".join(file_uri_list) + "')"
    try:
        inventory_records = _fetch_all_matching_records(datasets_api, dataset_id, "file_inventory", filter_string)
    except RuntimeError as e:
        logging.error(f"{e}. Aborting without soft-deleting any records.")
        return
    bad_row_ids = {record["datarepo_row_id"] for record in inventory_records}
    bad_file_refs = {record["file_ref"] for record in inventory_records if record.get("file_ref")}
    if not bad_row_ids:
        logging.info("No file_inventory records found for the specified files. Nothing to delete.")
        return

    # Loop through tables with filerefs and look for bad file references
    bad_records_dict = {}
    bad_ref_clause = "('" + "', '".join(sorted(bad_file_refs)) + "')"
    # Without any file refs there is nothing another table could be pointing at
    tables_to_check = fileref_col_dict if bad_file_refs else {}
    for table, fileref_cols in tables_to_check.items():

        # Build filter string
        logging.info(f"Checking the '{table}' table for bad file references.")
        filter_string = " OR ".join(f"{field} in {bad_ref_clause}" for field in fileref_cols)

        # Find problematic records and record information
        try:
            table_records = _fetch_all_matching_records(datasets_api, dataset_id, table, filter_string)
        except RuntimeError as e:
            # Unknown reference state: deleting now could orphan tabular records
            logging.error(f"{e}. Aborting without soft-deleting any records.")
            return
        bad_records = []
        for result_entry in table_records:
            bad_col_list = [field for field in fileref_cols if _references_bad_file(result_entry.get(field), bad_file_refs)]
            if bad_col_list:
                key_val = result_entry[key_col_dict[table]]
                # f-string formatting so non-string key values do not raise
                bad_records.append(f"{key_val}: " + ", ".join(bad_col_list))

        # Record results
        if bad_records:
            bad_records_dict[table] = bad_records

    # If bad records outside of file_inventory are identified, report them out, otherwise delete bad file_inventory records
    if bad_records_dict:
        logging.info("Tabular data records with references to bad files found. Please review the output and correct. Will NOT soft-delete the records from the file_inventory table.")
        tabular_data_results = [[dataset_id, key, key_col_dict[key], val] for key, val in bad_records_dict.items()]
        logging.info("Tabular data records with references to bad files:")
        tabular_results_df = pd.DataFrame(tabular_data_results, columns = ["dataset_id", "table", "key_column", "key_vals_w_affected_cols"])
        display(tabular_results_df)
    else:
        logging.info("No tabular data records with references to bad files found. Soft-deleting bad records from file_inventory.")
        delete_result = delete_datarepo_rows(dataset_id, "file_inventory", list(bad_row_ids))
        logging.info(f"Soft-delete result: {delete_result}")
    
#############################################
## Input Parameters
#############################################

# Dataset to update
dataset_id = "4b456e27-e78f-4ced-a6a1-887f2539ddbe"

# List of file URIs to remove from the dataset
# Each entry is matched against the `uri` column of the dataset's file_inventory table.
# The list is empty as committed; populate it before running the execution line below.
file_uri_list = [

]

#############################################
## Execution
#############################################

remove_deleted_files(dataset_id, file_uri_list)


## Script to Examine VDS Dataset Files

In [4]:
def compare_vds_files(bucket):
    """Count a bucket's VDS objects and how many also exist in the joint-call workspace buckets.

    Objects are compared on the part of their name after ".vds", plus md5Hash and size, using
    the object-metadata inventory table named in the query.

    Args:
        bucket: Name of the cohort workspace bucket to examine.

    Returns:
        Tuple (status, total_vds_file_count, vds_file_in_jointcall_ws_count, diff), where diff
        is the number of VDS objects not matched in the joint-call buckets. When the query
        fails, the error is logged and ("Failure", 0, 0, 0) is returned.
    """
    
    # Execute query
    client = bigquery.Client()
    # Raw f-string: keeps the regex backslash literal (a plain f-string treats `\.` as an
    # invalid escape sequence) while still interpolating {bucket}
    query = rf"""
            WITH jointcall_files
            AS
            (
              SELECT REGEXP_EXTRACT(name, r'.*\.vds(.+)$') AS object_name, md5Hash, size
              FROM `broad-dsde-prod-analytics-dev.anvil_inventory.object_metadata_26_02_2024__17_14_55` 
              WHERE bucket IN ('fc-secure-9e3357c0-389c-41d7-94ee-56673db6b75f', 'fc-secure-7e69c896-d6c0-4a4e-8490-42cb2d4fdebf')
            ), 
            cohort_ws_files
            AS
            (
              SELECT REGEXP_EXTRACT(name, r'.*\.vds(.+)$') AS object_name, md5Hash, size
              FROM `broad-dsde-prod-analytics-dev.anvil_inventory.object_metadata_26_02_2024__17_14_55` 
              WHERE bucket = '{bucket}'
              AND name LIKE '%.vds%'
            )
            SELECT COUNT(DISTINCT c.object_name) AS total_vds_file_count, 
            COUNT(DISTINCT j.object_name) AS vds_file_in_jointcall_ws_count
            FROM cohort_ws_files c
              LEFT JOIN jointcall_files j
              ON c.object_name = j.object_name 
              AND c.md5Hash = j.md5Hash
              AND c.size = j.size"""
    try:
        df = client.query(query).result().to_dataframe()
        total_vds_file_count = df["total_vds_file_count"].values[0]
        vds_file_in_jointcall_ws_count = df["vds_file_in_jointcall_ws_count"].values[0] 
        diff = total_vds_file_count - vds_file_in_jointcall_ws_count
        return "Success", total_vds_file_count, vds_file_in_jointcall_ws_count, diff
    except Exception as e:
        # Still reported through the status value, but logged so the cause can be diagnosed
        logging.error(f"Error comparing VDS files for bucket {bucket}: {e}")
        return "Failure", 0, 0, 0

# List of workspace buckets whose VDS files are compared against the joint-call workspace buckets
bucket_list = [
    'fc-secure-f5d884c0-a24c-46e6-8c29-cad7f5b158c7',
    'fc-secure-6513d7e1-2dbb-41a2-baea-3f7fdbcbb620',
    'fc-d3e9eb24-cb19-47d8-b2c6-d85fd34b4ff1',
    'fc-0ed1ef2d-1039-4c8a-a0a9-91c3e385200a',
    'fc-282a8e0b-df88-42de-9059-2b7447d9f9c7',
    'fc-secure-5efb4966-0994-41f8-a911-1d159c9bae1b',
    'fc-2836a560-113a-4239-acab-5cce58019b73',
    'fc-bb71bb7a-fdb1-427a-9e56-eb08b6fd7955',
    'fc-secure-e9b2e26a-3f73-4f5a-862f-c5b3be68703f',
    'fc-e7051891-25c8-4776-80ed-26b1af860277',
    'fc-4f070061-0bc2-4f9a-9fe9-869a739c9817',
    'fc-c1701683-c10e-4f73-a636-f774e8b650c2',
    'fc-secure-ccca1171-d3ee-42b3-8df8-aca336279cf3',
    'fc-2d61b7df-571f-4201-a674-1107c84711df',
    'fc-5d7cf59f-e361-4073-a6ad-16d8d78cc613',
    'fc-secure-70487b95-e89c-45ec-ad0a-e5382d625c33',
    'fc-2b68ae78-57af-4c65-8020-6f5ed4ae9408',
    'fc-secure-33bdfbdb-de58-474e-8591-dad501aa1995',
    'fc-2bcebe36-5d83-486a-947a-bbb5a606701d',
    'fc-secure-1fb85b31-9a1e-46ef-a206-41040d151f94',
    'fc-secure-ead0ff8d-eee9-4299-bb54-8404ffe9fa22',
    'fc-secure-d8de1fe3-972d-480f-a8a8-2bbc251add30',
    'fc-secure-4d47049d-9a31-435d-8c97-61cffce9a83b',
    'fc-secure-31d85e96-7fa0-4c2e-a89a-fe5c70845fd7',
    'fc-secure-68ff7cc9-274c-45d6-baad-75b9c5971a9c',
    'fc-secure-fdaa7a52-520b-461b-a2d2-e31bf92e8e86',
    'fc-secure-9c348df7-4da1-428a-a785-e06db3a9f208',
    'fc-secure-0de89e54-2149-4e06-81f9-da5af48c68a3',
    'fc-secure-986229e0-ac72-420d-bf0e-aa14dea63a05',
    'fc-secure-4931149d-9e71-4865-9f41-3e4c998ffb38',
    'fc-secure-13597242-de35-44e2-b8fb-b5fa0b983501',
    'fc-secure-75f95e44-299f-4666-bed3-46dd679b12d8',
    'fc-secure-240e1629-6d73-42ab-a373-1abeec17824c',
    'fc-secure-94c90c12-376d-419a-96d9-ed37e1b1a5bb',
    'fc-secure-6c21e787-1a4b-4235-b756-9ce6096fc815',
    'fc-secure-51198b17-37ae-44b7-8513-c11c4bfe3a9d',
    'fc-secure-59794551-d924-4ad7-905b-8727646d9aad',
    'fc-secure-8ec82876-176f-4f33-ae98-0a3cae871ed4',
    'fc-secure-8a282388-3c56-48c6-99c8-ea4b52c053b9',
    'fc-secure-3e71d768-9da9-4845-9e2c-7e909db92cb7',
    'fc-secure-0fc5a889-f57e-40a1-9859-c5b1e8a196d1',
    'fc-secure-0f948ad2-2ae8-433c-9f0c-941c4c5e4a89',
    'fc-secure-678eccb8-3463-4a72-8b57-69dfc8c77002',
    'fc-secure-221b863c-a724-42f3-9f90-2081b352799c',
    'fc-secure-21cd882f-8470-4c2e-93dc-536a908bae73',
    'fc-secure-e92b8081-5e6a-440c-af83-4d428f505529',
    'fc-secure-e0034430-99a3-4dde-99d3-a2330cd90f19',
    'fc-secure-315a127f-649d-4928-b4e0-cdca7d898e05',
    'fc-secure-550ffe2e-04fd-4763-b7d2-09f0c59083e4',
    'fc-secure-84e57da6-4df9-45de-9f82-8a550887a7fa',
    'fc-secure-2b4d5d05-d951-4e51-8ece-7e851660f91a',
    'fc-secure-bfe6497b-69a1-4917-8a7b-c9bd36cb4ae4',
    'fc-secure-3b588f92-0298-4ad6-b75d-fa16de8b718d',
    'fc-secure-fe950bf8-0470-4329-b8c9-8a42d0dd619d',
    'fc-secure-59d2af1f-3dc0-407b-b7ab-05cdcfa4da8f',
    'fc-secure-01106611-a0e9-41bb-ac13-27683ab2fc19',
    'fc-secure-7a160245-84eb-4383-80ed-f41c2411e702',
    'fc-secure-32a2f8aa-4f72-43e9-9450-bbf661bde5ef',
    'fc-secure-ce2baa61-748a-4dbc-a929-f256721b59b2',
    'fc-secure-ac202043-c5ef-4fb7-8ccb-62a274c1b8ec',
    'fc-secure-652024de-0ecd-4de3-8360-c8c5bfcafd72',
    'fc-secure-7c845669-3781-4ac0-bb59-1495d68d1d85',
    'fc-secure-330f768f-83c4-4570-ae46-0626b477d2b0',
    'fc-secure-dccff364-c2ff-42df-8c8e-f979a0472c11',
    'fc-secure-17d8dbc9-d1d8-4d5d-8eb7-c1b82bef24d8',
    'fc-secure-674fbd89-9eeb-4e43-8a6f-97d6e50708e0',
    'fc-secure-9f2f0267-2df4-44e9-a6ae-dd1d3a43cca5',
    'fc-secure-6bc832d1-a35b-4676-bf68-a5772e2be044',
    'fc-secure-e4b45d7c-3fee-479f-83e9-8c85312cb8da',
    'fc-secure-516245cb-7dcc-487d-acf7-43e5fb10085f',
    'fc-secure-ab235723-ed31-4242-b5ab-23c177a0e79c',
    'fc-secure-b9906df4-3012-4c7b-a008-3c5708885971',
    'fc-secure-05e511c4-0b47-41a5-a361-99f747cbef6c',
    'fc-secure-adba6cb8-c49c-405b-af7b-9980e4a9d36a',
    'fc-secure-98a7c433-bacc-44fd-96f6-faed04dd1c96',
    'fc-secure-fd756575-ba39-4893-8b85-b6dfbb376f3b',
    'fc-secure-a473f80e-97a6-4c19-bd68-e37266efb44d',
    'fc-secure-04e82709-08e0-4335-aaef-ba55089f6fd9',
    'fc-secure-ed823158-2149-493f-80d0-ff066cb14a85',
    'fc-secure-bcc5d428-aed0-4814-aefc-f717b97d5106',
    'fc-secure-de72ef13-9b7f-44db-9428-5df489d327ce',
    'fc-secure-35c81df8-8bdf-467a-af6f-fb807185b82e',
    'fc-secure-39458ab6-c2d3-49e2-b6d5-8bb3bae9a245',
    'fc-secure-124c02b9-69b7-468c-b3d7-4a07aee74dc5',
    'fc-secure-a065288d-5bb4-441c-95e9-0ffb20a6cf40',
    'fc-secure-ee694ec4-cb3d-441d-95f7-e6d586419484',
    'fc-secure-bd923846-0b8b-4018-8706-44b2a8e213b4',
    'fc-secure-4c07a18a-8c79-4b81-acbe-91083298f1e4',
    'fc-secure-21c6905e-06c8-45f2-b6ed-ffba467f7f75',
    'fc-secure-538d85ea-c436-43f9-b001-4db614ed96bf',
    'fc-secure-d87970dd-adb0-4b99-a204-ae6fbd457d12',
    'fc-secure-5f916770-fded-4540-b4b6-49f88b8e05fc',
    'fc-secure-0aedc988-3736-496c-b7ac-20cca5b3ceb9',
    'fc-secure-d4bead53-0db1-4e25-87da-c02be5819368',
    'fc-secure-86cbdfa9-cbc0-40fb-adfa-3dd467ae1062',
    'fc-secure-d157fd3c-57ff-4640-a084-cecda832e575',
    'fc-secure-08bb70e6-9fa1-40dc-8822-41d73945c053',
    'fc-secure-6a2f53f1-6712-48a9-a7b2-3289b8df877b',
    'fc-secure-55225e12-ec4c-42e0-a5d1-986c87c6d129',
    'fc-secure-89bba08d-ef3b-47bb-9c9b-a937d7550a97',
    'fc-secure-bf34568b-1c38-4c43-8a21-59630b969553',
    'fc-secure-8a297961-e042-4d02-826f-0322b3d7fbff',
    'fc-secure-9befa92f-ef34-4fcf-8df5-d085656e26dd',
    'fc-secure-870d27c3-a758-4535-b8dd-5fc0514c5215',
    'fc-secure-980cd412-6b18-480a-b2f2-ad1543c06a91',
    'fc-secure-16e0c63c-847a-42ef-91ca-3523b3668357',
    'fc-secure-c53831f7-0431-44e5-abe6-308270690c3b',
    'fc-secure-51a26e99-63eb-442a-869d-87ecbc60c814',
    'fc-secure-e5676c90-7028-4b68-b620-c6944514d52c',
    'fc-secure-977aa72f-e9ce-4fb6-b32b-c675b4ef25d5',
    'fc-secure-d7a002ea-7e1e-45fd-8e76-456fce471f17',
    'fc-secure-6537d7f6-f29f-432b-b66e-8cf2204b7920',
    'fc-secure-b2669acd-7139-464f-af53-af7215c068aa',
    'fc-secure-cb3eeabf-f0ef-497e-9bc6-b5a27be4fec2',
    'fc-secure-2180b508-ce9d-4535-aa9f-f07d5917025c',
    'fc-secure-7e0893cd-4f31-41e4-b1d2-3e656097824a',
    'fc-secure-f8b9ce8d-efc0-4aa1-ad71-c0378d8d7194',
    'fc-secure-b2f4e185-a21a-434a-9494-d1fabaaaf7c0',
    'fc-secure-1355eb72-b00f-4796-8892-ac271b699503',
    'fc-secure-68b7e62f-132b-4818-bf64-6c38ec9152ab',
    'fc-secure-dae591de-00ad-478c-9440-88034a1b8cb9',
    'fc-secure-7e9fe869-643a-4828-a1b7-0245e34745ae',
    'fc-secure-228fd6fd-e0f7-4895-a246-3b055be27aa1',
    'fc-secure-e99706c4-48f9-4a69-baf4-70d1c5eaac5c',
    'fc-secure-d2c84e56-8f0d-420a-96a4-942e92009433',
    'fc-secure-589e3f7a-7b24-46cf-aefd-63b05155d826',
    'fc-secure-3fdbe020-6bdb-4668-bcb8-0d0df9d4ba8a',
    'fc-secure-b31156cd-4993-4f69-a8f4-9a99c2697965',
    'fc-secure-73036e74-c8b0-4e6f-9f4f-ca55b599d5d1',
    'fc-secure-3c4843c0-b83f-4ba1-9bba-9c9a599f3ffb',
    'fc-secure-91f9e579-b064-4992-8b00-c789ca48f861',
    'fc-secure-ac588f86-da2d-4a92-9f45-be2aeedd5fac',
    'fc-secure-2fa3df40-c189-41ee-b5ba-484a0b77ef77',
    'fc-secure-00737009-4e0f-454d-bb02-4b70566a0ed2',
    'fc-secure-36dfb67b-d2fc-47a1-a94c-225d72e08afd',
    'fc-secure-55efa443-810c-48c8-90bb-f07beba0e560',
    'fc-secure-43207dac-0905-4fdd-b816-a34bd2ccebdd',
    'fc-secure-fba19c6f-984e-4616-b253-6d9e6ea5cec5',
    'fc-secure-1614d6d2-d053-4de0-9b97-cc4b0762f547',
    'fc-secure-c40af798-8afc-4ab3-9b66-946955811d3b',
    'fc-secure-abc7f058-0260-4e82-a911-abfec3dcb676',
    'fc-secure-29cd113f-7eca-4526-aa52-dde1b8cb41d0',
    'fc-secure-877e6c8c-72ef-46d0-b3f3-37dd175771fe',
    'fc-secure-0eba3dae-89be-4642-8982-9a80a7428cd2',
    'fc-secure-0ca0c5e6-26ca-47ea-b509-ec4eaa058fc6',
    'fc-secure-bee7792c-ef35-478d-a9bb-c8f2054c335c',
    'fc-secure-72a949c5-0b7d-45c9-96c3-ff4d25815ed5',
]
# Examine every bucket and collect one result row per bucket
results = []
for bucket in bucket_list:
    logging.info(f"Examining VDS files in bucket: {bucket}")
    status, total_vds_file_count, vds_file_in_jointcall_ws_count, diff = compare_vds_files(bucket) 
    results.append([bucket, status, total_vds_file_count, vds_file_in_jointcall_ws_count, diff])

# Build the results frame once, after the loop, so it is not rebuilt on every iteration
# and is still defined when bucket_list is empty
results_df = pd.DataFrame(results, columns = ["bucket", "status", "total_vds_file_count", "vds_file_in_jointcall_ws_count", "diff"])
logging.info("Results:")
display(results_df)

04/06/2024 12:19:36 AM - INFO: Examining VDS files in bucket: fc-secure-f5d884c0-a24c-46e6-8c29-cad7f5b158c7
04/06/2024 12:19:48 AM - INFO: Examining VDS files in bucket: fc-secure-6513d7e1-2dbb-41a2-baea-3f7fdbcbb620
04/06/2024 12:20:00 AM - INFO: Examining VDS files in bucket: fc-d3e9eb24-cb19-47d8-b2c6-d85fd34b4ff1
04/06/2024 12:20:12 AM - INFO: Examining VDS files in bucket: fc-0ed1ef2d-1039-4c8a-a0a9-91c3e385200a
04/06/2024 12:20:23 AM - INFO: Examining VDS files in bucket: fc-282a8e0b-df88-42de-9059-2b7447d9f9c7
04/06/2024 12:20:36 AM - INFO: Examining VDS files in bucket: fc-secure-5efb4966-0994-41f8-a911-1d159c9bae1b
04/06/2024 12:20:49 AM - INFO: Examining VDS files in bucket: fc-2836a560-113a-4239-acab-5cce58019b73
04/06/2024 12:21:00 AM - INFO: Examining VDS files in bucket: fc-bb71bb7a-fdb1-427a-9e56-eb08b6fd7955
04/06/2024 12:21:15 AM - INFO: Examining VDS files in bucket: fc-secure-e9b2e26a-3f73-4f5a-862f-c5b3be68703f
04/06/2024 12:21:27 AM - INFO: Examining VDS files in 

04/06/2024 12:34:40 AM - INFO: Examining VDS files in bucket: fc-secure-a473f80e-97a6-4c19-bd68-e37266efb44d
04/06/2024 12:34:52 AM - INFO: Examining VDS files in bucket: fc-secure-04e82709-08e0-4335-aaef-ba55089f6fd9
04/06/2024 12:35:03 AM - INFO: Examining VDS files in bucket: fc-secure-ed823158-2149-493f-80d0-ff066cb14a85
04/06/2024 12:35:14 AM - INFO: Examining VDS files in bucket: fc-secure-bcc5d428-aed0-4814-aefc-f717b97d5106
04/06/2024 12:35:26 AM - INFO: Examining VDS files in bucket: fc-secure-de72ef13-9b7f-44db-9428-5df489d327ce
04/06/2024 12:35:37 AM - INFO: Examining VDS files in bucket: fc-secure-35c81df8-8bdf-467a-af6f-fb807185b82e
04/06/2024 12:35:48 AM - INFO: Examining VDS files in bucket: fc-secure-39458ab6-c2d3-49e2-b6d5-8bb3bae9a245
04/06/2024 12:36:00 AM - INFO: Examining VDS files in bucket: fc-secure-124c02b9-69b7-468c-b3d7-4a07aee74dc5
04/06/2024 12:36:11 AM - INFO: Examining VDS files in bucket: fc-secure-a065288d-5bb4-441c-95e9-0ffb20a6cf40
04/06/2024 12:36:25

Unnamed: 0,bucket,status,total_vds_file_count,vds_file_in_jointcall_ws_count,diff
0,fc-secure-f5d884c0-a24c-46e6-8c29-cad7f5b158c7,Success,969335,969335,0
1,fc-secure-6513d7e1-2dbb-41a2-baea-3f7fdbcbb620,Success,43558,43558,0
2,fc-d3e9eb24-cb19-47d8-b2c6-d85fd34b4ff1,Success,43558,43558,0
3,fc-0ed1ef2d-1039-4c8a-a0a9-91c3e385200a,Success,43558,43558,0
4,fc-282a8e0b-df88-42de-9059-2b7447d9f9c7,Success,43558,43558,0
5,fc-secure-5efb4966-0994-41f8-a911-1d159c9bae1b,Success,43558,43558,0
6,fc-2836a560-113a-4239-acab-5cce58019b73,Success,43558,43558,0
7,fc-bb71bb7a-fdb1-427a-9e56-eb08b6fd7955,Success,43558,43558,0
8,fc-secure-e9b2e26a-3f73-4f5a-862f-c5b3be68703f,Success,43558,43558,0
9,fc-e7051891-25c8-4776-80ed-26b1af860277,Success,43558,43558,0


# Identify and resolve records missing part_of_dataset_id

## Script to identify whether datasets need to be patched

In [6]:
#############################################
## Functions
#############################################

def check_dataset_fk_field(dataset_id_list):
    """Report which datasets contain records that are missing `part_of_dataset_id`.

    For every dataset ID the function looks up the dataset's BigQuery project
    and dataset name through the TDR datasets API and then runs one COUNT
    query per table:

    * `anvil_donor`: rows where `part_of_dataset_id IS NULL`
    * `anvil_biosample`: rows where `ARRAY_LENGTH(part_of_dataset_id) = 0`
      (presumably the column is an array in this table; confirm against the schema)

    The function only reads; it issues no ingest or patch request.

    Args:
        dataset_id_list: Iterable of TDR dataset ID strings to examine.

    Returns:
        None. A DataFrame with the columns "Dataset ID", "Table", "Status" and
        "Message" (one row per dataset/table check, or a single "All" row when
        the dataset lookup itself fails) is rendered with `display`.
    """
    results = []
    for dataset_id in dataset_id_list:

        # Retrieve dataset information
        logging.info(f"Processing dataset_id = {dataset_id}...")
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        try:
            logging.info("Retrieving dataset details.")
            response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
            bq_project = response["access_information"]["big_query"]["project_id"]
            bq_dataset = response["access_information"]["big_query"]["dataset_name"]
        except Exception as e:
            error_message = f"Error retrieving dataset details: {str(e)}"
            logging.error(error_message)
            results.append([dataset_id, "All", "Failure", error_message])
            continue

        # A single BigQuery client serves both table checks for this dataset
        client = bigquery.Client()
        for table in ["anvil_donor", "anvil_biosample"]:

            # Count the records whose part_of_dataset_id is missing
            logging.info(f"Checking whether patching is required for the {table} table.")
            if table == "anvil_donor":
                missing_filter = "part_of_dataset_id IS NULL"
            else:
                missing_filter = "ARRAY_LENGTH(part_of_dataset_id) = 0"
            query = """SELECT COUNT(*) AS null_cnt FROM `{project}.{dataset}.{src_table}` WHERE {missing_filter}""".format(
                project=bq_project, dataset=bq_dataset, src_table=table, missing_filter=missing_filter
            )
            try:
                df = client.query(query).result().to_dataframe()
                patch_needed = bool(df["null_cnt"].values[0] > 0)
            except Exception as e:
                error_message = f"BigQuery error: {str(e)}"
                # Log the failure as well as recording it, matching the dataset-retrieval error path
                logging.error(error_message)
                results.append([dataset_id, table, "Failure", error_message])
                continue

            # Record the verdict only; nothing is patched by this function
            message = "Patch Needed" if patch_needed else "No Patch Needed"
            results.append([dataset_id, table, "Success", message])

    # Display results
    logging.info("\nResults:")
    df_results = pd.DataFrame(results, columns=["Dataset ID", "Table", "Status", "Message"])
    display(df_results)


#############################################
## Input Parameters
#############################################

# List of dataset IDs to examine. This cell only reports whether a patch is
# needed (check_dataset_fk_field issues no ingest or patch request).
dataset_id_list = [
    'f9224ea2-dd31-421d-80d4-f35082ef8d68',
    'd7bcfc5d-e258-4bd6-a413-bb7a118e6bff',
    '6d18aafc-0240-499c-902e-a72a5b98ff0a',
]

#############################################
## Execution
#############################################

check_dataset_fk_field(dataset_id_list)


03/22/2024 06:30:26 PM - INFO: Processing dataset_id = f9224ea2-dd31-421d-80d4-f35082ef8d68...
03/22/2024 06:30:26 PM - INFO: Retrieving dataset details.
03/22/2024 06:30:26 PM - INFO: Checking whether patching is required for the anvil_donor table.
03/22/2024 06:30:29 PM - INFO: Checking whether patching is required for the anvil_biosample table.
03/22/2024 06:30:31 PM - INFO: Processing dataset_id = d7bcfc5d-e258-4bd6-a413-bb7a118e6bff...
03/22/2024 06:30:31 PM - INFO: Retrieving dataset details.
03/22/2024 06:30:32 PM - INFO: Checking whether patching is required for the anvil_donor table.
03/22/2024 06:30:34 PM - INFO: Checking whether patching is required for the anvil_biosample table.
03/22/2024 06:30:36 PM - INFO: Processing dataset_id = 6d18aafc-0240-499c-902e-a72a5b98ff0a...
03/22/2024 06:30:36 PM - INFO: Retrieving dataset details.
03/22/2024 06:30:36 PM - INFO: Checking whether patching is required for the anvil_donor table.
03/22/2024 06:30:38 PM - INFO: Checking whether pa

Unnamed: 0,Dataset ID,Table,Status,Message
0,f9224ea2-dd31-421d-80d4-f35082ef8d68,anvil_donor,Success,Patch Needed
1,f9224ea2-dd31-421d-80d4-f35082ef8d68,anvil_biosample,Success,No Patch Needed
2,d7bcfc5d-e258-4bd6-a413-bb7a118e6bff,anvil_donor,Success,Patch Needed
3,d7bcfc5d-e258-4bd6-a413-bb7a118e6bff,anvil_biosample,Success,No Patch Needed
4,6d18aafc-0240-499c-902e-a72a5b98ff0a,anvil_donor,Success,Patch Needed
5,6d18aafc-0240-499c-902e-a72a5b98ff0a,anvil_biosample,Success,No Patch Needed


## Script to patch dataset

In [2]:
#############################################
## Functions
#############################################

def _build_fk_check_query(bq_project, bq_dataset, table):
    """Return the SQL that counts rows of `table` whose part_of_dataset_id is missing."""
    # anvil_donor is checked for NULL; any other table is checked for an empty array
    if table == "anvil_donor":
        missing_filter = "part_of_dataset_id IS NULL"
    else:
        missing_filter = "ARRAY_LENGTH(part_of_dataset_id) = 0"
    return """SELECT COUNT(*) AS null_cnt FROM `{project}.{dataset}.{src_table}` WHERE {missing_filter}""".format(
        project=bq_project, dataset=bq_dataset, src_table=table, missing_filter=missing_filter
    )


def _build_fk_patch_query(bq_project, bq_dataset, table):
    """Return the SQL that re-selects `table` with part_of_dataset_id filled in on every row."""
    if table == "anvil_donor":
        # With DESC / NULLS LAST ordering the running MAX already equals the largest
        # non-null value on the first row, so every row receives that value.
        # NOTE(review): assumes one dataset ID value per table - confirm.
        query = """SELECT * EXCEPT(part_of_dataset_id), MAX(part_of_dataset_id) OVER (ORDER BY part_of_dataset_id DESC NULLS LAST) AS part_of_dataset_id 
                    FROM `{project}.{dataset}.{src_table}`"""
    else:
        # Array variant: take the largest stringified array and wrap it in a one-element array
        query = """WITH dataset_id
                    AS
                    (
                      SELECT MAX(ARRAY_TO_STRING(part_of_dataset_id, "")) AS id
                      FROM `{project}.{dataset}.{src_table}` 
                    )
                    SELECT * EXCEPT(part_of_dataset_id), [(SELECT MAX(id) FROM dataset_id)] AS part_of_dataset_id
                    FROM `{project}.{dataset}.{src_table}`"""
    return query.format(project=bq_project, dataset=bq_dataset, src_table=table)


def _write_json_lines(df, target_file):
    """Write `df` to `target_file` as one JSON object per line, without a trailing newline."""
    # Round-trip through DataFrame.to_json so values are serialized the way pandas does it
    records = json.loads(df.to_json(orient='records'))
    with open(target_file, 'w') as outfile:
        outfile.write("\n".join(json.dumps(record) for record in records))


def _copy_file_to_bucket(local_path, destination_uri):
    """Copy `local_path` to `destination_uri` with gsutil; raise RuntimeError if the copy fails."""
    # Imported here so this cell stays runnable without touching the notebook's import cell
    import subprocess
    completed = subprocess.run(["gsutil", "cp", local_path, destination_uri], capture_output=True, text=True)
    if completed.returncode != 0:
        raise RuntimeError(f"gsutil cp to {destination_uri} failed: {(completed.stderr or '').strip()}")


def _ingest_with_retry(dataset_id, ingest_request):
    """Submit the ingest and wait for it, retrying once; return None on success or the last error text."""
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            # Refresh the TDR client before each attempt
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, _job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest succeeded: {}".format(str(ingest_request_result)[0:1000]))
            return None
        except Exception as e:
            logging.error("Error on Dataset Ingest: {}".format(str(e)))
            if attempt < max_attempts:
                logging.info("Retrying Dataset Ingest (attempt #{})...".format(str(attempt)))
                sleep(10)
            else:
                logging.error("Maximum number of retries exceeded. Exiting function.")
                return str(e)


def check_and_patch_dataset_fk_field(dataset_id_list, profile_id="e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"):
    """Find records missing `part_of_dataset_id` and re-ingest the affected tables.

    For every dataset ID the function looks up the dataset's BigQuery location
    through the TDR datasets API and checks `anvil_donor` and `anvil_biosample`
    for rows whose `part_of_dataset_id` is NULL / an empty array. When such
    rows exist, the whole table is re-selected with the column filled in,
    written as newline-delimited JSON, copied to
    `{ws_bucket}/ingest_pipeline/output/transformed/anvil/{dataset_id}/table_data/`
    and submitted to the dataset as an ingest with `updateStrategy` "replace".

    A failed upload is recorded as a failure and the ingest is skipped, so a
    missing or stale file is never ingested. The local JSON file is removed
    whether or not the upload succeeds.

    Args:
        dataset_id_list: Iterable of TDR dataset ID strings to examine and patch.
        profile_id: Value sent as `profile_id` in the ingest request. Defaults
            to the ID previously hard-coded in this cell.

    Returns:
        None. A DataFrame with the columns "Dataset ID", "Table", "Status" and
        "Message" (one row per dataset/table, or a single "All" row when the
        dataset lookup fails) is rendered with `display`.
    """
    results = []
    for dataset_id in dataset_id_list:

        # Retrieve dataset information
        logging.info(f"Processing dataset_id = {dataset_id}...")
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        try:
            logging.info("Retrieving dataset details.")
            response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
            bq_project = response["access_information"]["big_query"]["project_id"]
            bq_dataset = response["access_information"]["big_query"]["dataset_name"]
        except Exception as e:
            error_message = f"Error retrieving dataset details: {str(e)}"
            logging.error(error_message)
            results.append([dataset_id, "All", "Failure", error_message])
            continue

        # A single BigQuery client serves every query for this dataset
        client = bigquery.Client()
        for table in ["anvil_donor", "anvil_biosample"]:

            # Evaluate whether a patch is needed
            logging.info(f"Checking whether patching is required for the {table} table.")
            try:
                df = client.query(_build_fk_check_query(bq_project, bq_dataset, table)).result().to_dataframe()
                patch_needed = bool(df["null_cnt"].values[0] > 0)
            except Exception as e:
                error_message = f"BigQuery error: {str(e)}"
                # Log the failure as well as recording it, matching the other error paths
                logging.error(error_message)
                results.append([dataset_id, table, "Failure", error_message])
                continue

            if not patch_needed:
                logging.info("No patching required!")
                results.append([dataset_id, table, "Success", "No Patch Needed"])
                continue

            # Reprocess table to populate missing values
            logging.info(f"Patching {table} table.")
            target_file = f"{table}.json"
            destination_dir = f"ingest_pipeline/output/transformed/anvil/{dataset_id}/table_data"
            logging.info("Creating updated table data.")
            try:
                df = client.query(_build_fk_patch_query(bq_project, bq_dataset, table)).result().to_dataframe()
                _write_json_lines(df, target_file)
                # Raises when gsutil fails, so the ingest below never runs against a missing file
                _copy_file_to_bucket(target_file, f"{ws_bucket}/{destination_dir}/")
                logging.info(f"Successfully created new {table}.json file.")
            except Exception as e:
                error_message = f"Error creating new json file. Exiting function. Error: {str(e)}"
                logging.error(error_message)
                results.append([dataset_id, table, "Failure", error_message])
                continue
            finally:
                # Always remove the local scratch file, including after a failure
                if os.path.exists(target_file):
                    os.remove(target_file)

            # Ingest the updated table data, replacing the existing records
            logging.info("Submitting ingest request for updated data.")
            source_full_file_path = "{}/{}/{}".format(ws_bucket, destination_dir, target_file)
            ingest_request = {
                "table": table,
                "profile_id": profile_id,
                "ignore_unknown_values": True,
                "resolve_existing_files": True,
                "updateStrategy": "replace",
                "format": "json",
                "load_tag": "Ingest for {}".format(dataset_id),
                "path": source_full_file_path
            }
            ingest_error = _ingest_with_retry(dataset_id, ingest_request)
            if ingest_error is None:
                results.append([dataset_id, table, "Success", "Records Patched"])
            else:
                results.append([dataset_id, table, "Failure", ingest_error])

    # Display results
    logging.info("\nResults:")
    df_results = pd.DataFrame(results, columns=["Dataset ID", "Table", "Status", "Message"])
    display(df_results)


#############################################
## Input Parameters
#############################################

# List of dataset IDs to examine and patch if necessary.
# Note: this reassigns the same dataset_id_list name used by the check-only
# cell above, so running this cell overwrites that list in the kernel.
dataset_id_list = [
    'f9224ea2-dd31-421d-80d4-f35082ef8d68',
]

#############################################
## Execution
#############################################

check_and_patch_dataset_fk_field(dataset_id_list)


03/22/2024 03:32:05 PM - INFO: Processing dataset_id = f9224ea2-dd31-421d-80d4-f35082ef8d68...
03/22/2024 03:32:05 PM - INFO: Retrieving dataset details.
03/22/2024 03:32:06 PM - INFO: Checking whether patching is required for the anvil_donor table.
03/22/2024 03:32:09 PM - INFO: Patching anvil_donor table.
03/22/2024 03:32:09 PM - INFO: Creating updated table data.
03/22/2024 03:32:24 PM - INFO: Successfully created new anvil_donor.json file.
03/22/2024 03:32:24 PM - INFO: Submitting ingest request for updated data.
TDR Job ID: O_kzkIQwRfq5hAh6y9V0bg
03/22/2024 03:32:54 PM - INFO: Ingest succeeded: {'dataset_id': 'f9224ea2-dd31-421d-80d4-f35082ef8d68', 'dataset': 'ANVIL_ALSCompute_Collection_GRU_20231016', 'table': 'anvil_donor', 'path': 'gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/transformed/anvil/f9224ea2-dd31-421d-80d4-f35082ef8d68/table_data/anvil_donor.json', 'load_tag': 'Ingest for f9224ea2-dd31-421d-80d4-f35082ef8d68', 'row_count': 7454, 'bad_row_count'

Unnamed: 0,Dataset ID,Table,Status,Message
0,f9224ea2-dd31-421d-80d4-f35082ef8d68,anvil_donor,Success,Records Patched
1,f9224ea2-dd31-421d-80d4-f35082ef8d68,anvil_biosample,Success,No Patch Needed


# Patch Dataset Properties

In [None]:
# Dataset-workspace list: each entry is [TDR dataset ID, workspace name].
# The loop below writes the workspace name into the dataset's
# "source_workspaces" property.
dataset_ws_list = [
    ['9a32e23e-840d-4ba3-8cd9-392f48b8e9d2', 'AnVIL_CCDG_Baylor_CVD_HemStroke_GOCHA_DS_WGS'],
    ['5069fc2c-b957-4130-adca-6eabae943867', 'AnVIL_CCDG_Baylor_CVD_HemStroke_WashU_DS_WGS'],
    ['1939b7ae-fc6b-42a8-ad5f-dc51a1682a17', 'AnVIL_CCDG_Broad_CVD_AF_Darbar_UIC_Cases_Arrays'],
    ['4e99b8e1-40b9-4fb2-90a0-d85e926ef31e', 'AnVIL_CCDG_Broad_CVD_AF_Darbar_UIC_Cases_WES'],
    ['2cda53ba-b852-47e8-8f24-59ab8e9f1d1f', 'AnVIL_CCDG_Broad_CVD_AF_Darbar_UIC_Controls_Arrays'],
    ['128332b6-5060-4ec4-b6a6-f53b54a810be', 'AnVIL_CCDG_Broad_CVD_AF_Darbar_UIC_Controls_WES'],
    ['06f05f58-3c83-4f5c-bddd-bed7d2d1d147', 'AnVIL_CCDG_Broad_CVD_AF_EAST_WES'],
    ['41cb9f29-4ba6-4690-821c-cb085e6b0f2f', 'AnVIL_CCDG_Broad_CVD_AF_Figtree_BioHeart_WES'],
    ['9d796a02-e2aa-4c15-b8d6-1e90cd736681', 'AnVIL_CCDG_Broad_CVD_AF_Natale_TCAI_Arrays'],
    ['7ea006d9-1e19-4678-b2e6-d4a1ea327f74', 'AnVIL_CCDG_Broad_CVD_AF_Natale_TCAI_WES'],
    ['433e3a09-661a-46a5-96f2-dbb07bdc87f3', 'AnVIL_CCDG_Broad_CVD_AF_Olesen_Arrays'],
    ['34fd3b22-ac73-47d2-8849-5877158ec072', 'AnVIL_CCDG_Broad_CVD_AF_Olesen_WES'],
    ['a08dc7a6-f8ce-4205-95d2-83f614c2c32f', 'AnVIL_CCDG_Broad_CVD_AF_PEGASUS_HMB'],
    ['7ce3270e-b2f2-47f4-a288-639751b2f87f', 'AnVIL_CCDG_Broad_CVD_AF_Roberts_UWO_WES'],
    ['fcb03f4f-e685-4803-aadb-0e8940ff4f37', 'AnVIL_CCDG_Broad_CVD_AF_TMDU_Cases_Arrays'],
    ['41d12dc1-8718-4439-b409-26cc23573107', 'AnVIL_CCDG_Broad_CVD_AF_TMDU_Cases_WES'],
    ['c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2', 'AnVIL_CCDG_Broad_CVD_AF_TMDU_Controls_Arrays'],
    ['c4c49fcd-0c20-4cff-841a-cb58f5689c5b', 'AnVIL_CCDG_Broad_CVD_AF_TMDU_Controls_WES'],
    ['9ee2a552-89f8-4a48-9c94-9fa26ebb7483', 'AnVIL_CCDG_Broad_CVD_AFib_Duke_WGS'],
    ['425412ba-894a-4824-acb8-bf18fe4576e0', 'AnVIL_CCDG_Broad_CVD_AFib_GENAF_WGS'],
    ['f22bd762-5c45-453e-bf22-b174514abb84', 'AnVIL_CCDG_Broad_CVD_AFib_Intermountain_WGS'],
    ['0ee62643-b064-42f8-9b09-5d10eacd70a3', 'AnVIL_CCDG_Broad_CVD_AFib_JHU_WGS'],
    ['c37b388c-7107-43d6-bee6-4e82b40ed271', 'AnVIL_CCDG_Broad_CVD_AFib_MPP_WGS'],
    ['bf6f1d78-6a0d-4afb-aea6-17a3c34340db', 'AnVIL_CCDG_Broad_CVD_AFib_Penn_WGS'],
    ['719f7581-21db-4aec-8c46-4a5811832710', 'AnVIL_CCDG_Broad_CVD_EOCAD_PROMIS_WGS'],
    ['15be288e-53e1-41cb-8d20-8ea87efb9258', 'AnVIL_CCDG_Broad_MI_ATVB_DS_CVD_WES'],
    ['8b8185d3-ba5c-4832-af23-3ff8ca6ed016', 'AnVIL_CCDG_Broad_MI_UnivUtah_DS_CVD_WES'],
    ['140797da-dc94-4fc2-8b0b-f2e1dec7bd43', 'AnVIL_CCDG_Broad_NP_Autism_State-Sanders_WGS'],
    ['8de6dae2-55ff-4287-9b75-5b2a950c1f44', 'AnVIL_CCDG_Broad_NP_Epilepsy_AUSALF_HMB_IRB_GSRS_GSA-MD'],
    ['d3ed2595-b8be-40c8-b7b6-10a4997b9d2e', 'AnVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS-EAED-MDS-NPU-IRB_GSA-MD'],
    ['61803dc8-f649-43e5-ab15-d351f2cef629', 'AnVIL_CCDG_Broad_NP_Epilepsy_AUTMUV_DS_NS_MDS_NPU_GSA-MD'],
    ['abe58d43-e1c7-4953-aa41-4d3b6f6cca44', 'AnVIL_CCDG_Broad_NP_Epilepsy_AUTMUV_DS_NS_NPU_ADLT_GSA-MD'],
    ['395da421-e6e8-4a26-ac93-eb7050a7cb1f', 'AnVIL_CCDG_Broad_NP_Epilepsy_GHAKNT_GRU_GSA-MD'],
    ['615f6246-1c39-4e44-a9d4-c7133a2ae62d', 'AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_GSA-MD'],
    ['21384132-1697-4e9b-b863-a6492d13285d', 'AnVIL_CCDG_Broad_NP_Epilepsy_KENKIL_GRU_GSA-MD'],
    ['b7fb531e-25a4-427c-9679-b7bdc3d03535', 'AnVIL_CCDG_Broad_NP_Epilepsy_TWNCGM_HMB-NPU-ADULTS_WES'],
    ['608d793e-a78b-4872-a50c-21a9eaa60ec3', 'AnVIL_CCDG_Broad_NP_Epilepsy_USACCF_HMB-MDS_GSA-MD'],
    ['af867604-d801-41cc-9949-017eb30a0cbf', 'AnVIL_CCDG_Broad_NP_Epilepsy_USALCH_HMB_MDS_GSA-MD'],
    ['722e332c-fb1a-45fe-80c7-cc670f025b7f', 'AnVIL_CCDG_Broad_NP_Epilepsy_USAMGH_HMB_MDS_GSA-MD'],
    ['1d140c76-a06b-42a0-bae8-b9e169ebe394', 'AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_HMB_NPU_MDS_GSA-MD'],
    ['3615e063-f24b-47f7-87cb-430e8aca8d0c', 'AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_GRU_NPU_WES'],
    ['e642bca0-52fb-4ab3-ab3a-acaab83deda7', 'AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_GRU_WES'],
    ['9ecc231f-e3d3-4417-a98a-c4db4c638161', 'AnVIL_CCDG_Broad_NP_Epilepsy_USAVANcontrols_HMB-GSO_WES'],
    ['c911503c-f010-4c17-ac57-1d82e954bdc7', 'AnVIL_CCDG_Broad_NP_Epilepsy_ZAFAGN_DS-EPI-COMO-MDS_GSA-MD'],
    ['3fb2d04a-d18b-4bdc-9372-99b992f2ae42', 'AnVIL_CCDG_Broad_NP_Epilepsy_ZAFAGN_DS-EPI-COMO-MDS_WES'],
    ['a3ae33bb-8b3a-47e5-a2d1-a49c954776b3', 'AnVIL_CCDG_NYGC_NP_Autism_HMCA_WGS'],
    ['0e65b131-fd14-4fce-908b-c5b89a71a9c1', 'AnVIL_CCDG_NYGC_NP_Autism_TASC_WGS'],
    ['d56ae233-d6d2-483c-917e-1de0fe1cfeb7', 'AnVIL_CCDG_TOPMED_Broad_CVD_EOCAD_PROMIS_WGS'],
    ['655e6a61-5400-4d8a-95bc-1506e026b289', 'AnVIL_CCDG_WashU_AI_T1D_T1DGC_WGS'],
    ['1f2d14d4-1bd8-46fc-9d35-1a415e5f326a', 'AnVIL_CCDG_WashU_CVD-NP-AI_Controls_VCControls_WGS'],
    ['64fd39fc-b32e-4b0a-8f83-4bf11b197462', 'AnVIL_CCDG_WashU_CVD_Brazil-CVD_WGS'],
    ['158ebecd-4596-4541-b832-a137232b7036', 'AnVIL_CCDG_WashU_CVD_EOCAD_BioMe_WGS'],
    ['1ccb95c3-1901-428e-b7bb-34495f41f4d2', 'AnVIL_CCDG_WashU_CVD_EOCAD_BioVu_WGS'],
    ['02ff1051-cd1d-4bbb-a005-21384cbff846', 'AnVIL_CCDG_WashU_CVD_EOCAD_Cleveland_WGS'],
    ['0144b0d3-a809-46df-8c67-7ce42bdd579a', 'AnVIL_CCDG_WashU_CVD_EOCAD_Duke_WGS'],
    ['35a1009d-93a2-49b1-a801-fe84d6b7a2f5', 'AnVIL_CCDG_WashU_CVD_EOCAD_Emerge_WGS'],
    ['50132478-c9fb-4dc5-86cd-d5dfab909393', 'AnVIL_CCDG_WashU_CVD_EOCAD_Emory_WGS'],
    ['35064fc1-6c52-4005-8e99-cb0d6afd3f8c', 'AnVIL_CCDG_WashU_CVD_EOCAD_Finland-CHD_WGS'],
    ['62cfdce6-2d4d-415c-a11e-5ab60131c668', 'AnVIL_CCDG_WashU_CVD_EOCAD_METSIM_WGS'],
    ['c5c0893f-b254-4038-8d08-b28ef5a26b5d', 'AnVIL_CMG_Broad_Brain_Engle_WGS'],
    ['b60876c5-d825-4303-befb-ffff55b92aba', 'AnVIL_CMG_Broad_Heart_Ware_WES'],
]

# Loop through the dataset/workspace pairs and make sure each dataset's
# "source_workspaces" property is exactly [workspace_name].
for entry in dataset_ws_list:
    # Pull dataset details
    dataset_id, workspace_name = entry[0], entry[1]
    logging.info(f"Processing dataset_id = {dataset_id}...")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        logging.info("Retrieving dataset details.")
        dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["PROPERTIES"]).to_dict()
        current_properties = dataset_details["properties"]
    except Exception as e:
        logging.error(f"Error retrieving dataset details: {str(e)}")
        # Skip this dataset: falling through would either raise NameError or
        # patch the previous dataset's properties onto this one.
        continue

    # Work on a copy, and tolerate a dataset whose properties are None
    updated_properties = dict(current_properties or {})
    expected_workspaces = [workspace_name]

    # Compare list to list, so datasets that are already correct are left alone
    if updated_properties.get("source_workspaces") == expected_workspaces:
        logging.info("source_workspaces already up to date; no patch needed.")
        continue

    # Update current properties and patch dataset
    updated_properties["source_workspaces"] = expected_workspaces
    try:
        logging.info("Patching dataset.")
        resp = datasets_api.patch_dataset(id=dataset_id, dataset_patch_request_model={"properties": updated_properties})
    except Exception as e:
        logging.error("Error on Dataset Patch: {}".format(str(e)))

