# Imports

In [None]:
# Install additional modules (one time effort per cloud environment)
!pip install --upgrade import_ipynb data_repo_client urllib3 xmltodict

In [None]:
## imports and environment variables

# Imports
import import_ipynb
import pandas as pd
import os
import re
import json
import data_repo_client
from google.cloud import bigquery
import ingest_pipeline_utilities as utils
import build_file_inventory as bfi
import identify_supplementary_files as isf
import logging
from time import sleep
import datetime
from google.cloud import storage
import math

# Logging: timestamped, level-tagged messages at INFO and above
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %I:%M:%S %p",
    format="%(asctime)s - %(levelname)s: %(message)s",
)

# pandas display: show every row/column at full width so result tables are never truncated
for _option_name, _option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ("display.max_colwidth", None),
    ('display.width', 1000),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
):
    pd.set_option(_option_name, _option_value)

# Workspace settings read from the notebook environment
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name is the bucket URI without its leading gs:// scheme
ws_bucket_name = ws_bucket[len("gs://"):] if ws_bucket.startswith("gs://") else ws_bucket


# Create new snapshot

## Script to create new full view snapshot

In [None]:
# Parameters shared by every snapshot created in this run
params = {}
params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org", "auth-domain"]
params["anvil_schema_version"] = "ANV5"

# Loop through datasets and create new snapshot
dataset_id_run_list = [
'8fbfea50-6a71-4b19-98e9-f95e3a8594c7',
'5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
'6dca0ce9-37b3-4b0a-93bd-7d3f21b0edf3',
'8b2b1c92-66cf-403c-8eb0-03b523d1550e',
'595b6755-e7ae-4e83-af2e-693c089aeec3',
'416b8daa-9537-46db-ae7b-3f5ff5f01dc3',
'84ac0d05-4be5-43e9-973e-ef999144d802',
'732eaae3-b509-4a7a-8961-09d861e55253',
'c6f3bd64-ea67-488f-904f-f0bdf6320b5c',
'544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
'd239dd7b-8d10-4960-aa91-8f8ede641e25',
'2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
'c1644d4e-06e2-4fa8-95f1-5c1da5831257',
'f85ea65e-1943-4bd6-a541-71c5d8465ca9',
'2cbe079d-e7ab-47d8-836e-454a71440297',
'280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
'488a38ee-f996-482d-a562-a4474f5594de',
'79c58bfb-3188-442b-9166-a50f28fcfae5',
'28e73469-12d4-493b-bf6f-83359c1f69c5',
'c2fd0797-ca41-49a1-b485-a4bedac00613',
'51daecbd-37fa-4a58-8625-b6fad65acf27',
'5afc14bf-d7ca-4a62-b7aa-5104fa846888',
'69178fa1-87d4-4ecc-bc0e-7347c3678635',
'15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
'3a3100bb-369e-47c1-a77c-2cacb7cf020d',
'95788aa7-c897-4ae8-9166-4b8fc1fc5342',
]
results = []
for dataset in dataset_id_run_list:
    dataset_id = dataset

    # Retrieve the dataset details needed to name and configure the snapshot.
    # A failure here is logged and recorded so the dataset still shows up in the results table.
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        phs_id = dataset_info["phs_id"]
        consent_name = dataset_info["properties"]["consent_name"]
        auth_domains = dataset_info["properties"]["auth_domains"]
    except Exception as e:
        logging.error("Error retrieving dataset {} from TDR: {}".format(dataset_id, str(e)))
        results.append([dataset_id, "Error", ""])
        continue
    if not dataset_name:
        logging.error("Dataset {} has no name; skipping snapshot creation.".format(dataset_id))
        results.append([dataset_id, "Error", ""])
        continue

    # Build the per-dataset parameters and create the snapshot
    params["ws_bucket"] = ws_bucket
    params["dataset_id"] = dataset_id
    params["dataset_name"] = dataset_name
    params["phs_id"] = phs_id
    params["consent_name"] = consent_name
    params["auth_domains"] = auth_domains
    params["pipeline_results"] = []
    current_datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M")
    params["snapshot_name"] = params["dataset_name"] + "_" + params["anvil_schema_version"] + "_" + current_datetime_string
    utils.create_and_share_snapshot(params)

    # Inspect the step results recorded in params["pipeline_results"] after the call
    int_df_results = pd.DataFrame(params["pipeline_results"], columns = ["Dataset", "Time", "Step", "Task", "Status", "Message"])
    errors = int_df_results[int_df_results["Status"].str.contains("Error")]
    if len(errors) > 0:
        results.append([dataset_id, "Error", ""])
        continue

    # Pull the new snapshot's UUID out of the "Create and Share Snapshot" message(s).
    # A missing ID is recorded as an error rather than crashing the rest of the batch.
    snapshot_messages = str(int_df_results[int_df_results["Task"]=="Create and Share Snapshot"]["Message"])
    snapshot_match = re.search("{'id': '([a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'", snapshot_messages)
    if snapshot_match:
        results.append([dataset_id, "Success", snapshot_match[1]])
    else:
        logging.error("No snapshot ID found in pipeline results for dataset {}.".format(dataset_id))
        results.append([dataset_id, "Error", ""])
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status", "snapshot_id"])
display(results_df)


## Verify Snapshots Have Properly Formatted DRS URI

In [None]:
def validate_snapshot_drs_format(snapshot_id):
    """Check the file_ref values in a snapshot's anvil_file table against the expected DRS URI pattern.

    Args:
        snapshot_id: ID of the TDR snapshot to inspect.

    Returns:
        "Success" when the count of non-null file_ref values equals the count of
        values matching the DRS pattern; otherwise a "Failure..." string describing
        the mismatch or the step (snapshot lookup / BigQuery) that failed.
    """
    # Look up where the snapshot's tables live in BigQuery
    tdr_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=tdr_client)
    try:
        snapshot_info = snapshots_api.retrieve_snapshot(id=snapshot_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_location = snapshot_info["access_information"]["big_query"]
        bq_project = bq_location["project_id"]
        bq_dataset = bq_location["dataset_name"]
    except Exception:
        return "Failure - Issue Retrieving Snapshot Info"

    # Count non-null file_ref values and how many of them match the DRS pattern
    bq_client = bigquery.Client()
    count_query = """SELECT COUNT(file_ref) AS rec_cnt, COUNT(CASE WHEN file_ref LIKE '%drs://drs.anv0:v2_%' THEN file_ref END) AS valid_cnt
                FROM `{project}.{dataset}.anvil_file`""".format(project=bq_project, dataset=bq_dataset)
    try:
        counts = bq_client.query(count_query).result().to_dataframe()
        rec_cnt = counts["rec_cnt"].values[0]
        valid_cnt = counts["valid_cnt"].values[0]
        if rec_cnt == valid_cnt:
            return "Success"
        return f"Failure: Only {valid_cnt} of {rec_cnt} records properly formatted"
    except Exception:
        return "Failure - BigQuery Error"

# Loop through snapshots and validate the DRS URI format of their file references
snapshot_id_list = [
'c3d22305-b3f2-4561-a5b9-bed82ee742f4',
'9fe2abd4-70b4-4eee-b00d-38726ced8620',
'5329c25e-ccad-435d-9250-6fcc3ff88472',
'ced601b2-9a11-40e9-8067-241e5a5996ed',
'8165245c-2003-4ec7-bf57-731959022d47',
'737d454c-88be-477f-ae2c-ef473e2106ce',
'3bdbad9e-f9d4-4442-8606-791d490bf0af',
'cd19195f-25a0-44b1-b47d-ec99141833fc',
'b897e519-ba8b-4758-a263-6d57bd3b8e2b',
'1d385cfc-4bed-4f52-8f7b-ea54fc44b4f7',
'02d25240-823f-4b1d-8562-95385716a453',
'1974a21b-c409-4736-a3d7-e195fa96c4eb',
'99b46287-4790-492c-8a12-bea33f0f927c',
'c6ef5822-3929-4ae7-b5bc-dc27528bf226',
'08d19a7e-b868-4766-9f7e-d879d972cbd7',
'35186e6d-2728-4a8e-b0ad-6b34d0fe480c',
'b0d176bf-d094-4e33-a34b-b83a94de86ea',
'cc6bacc8-29fa-4d97-8856-79f52ea50c6f',
'85b721da-ad8e-4d82-93f0-0988f94af22e',
'407c7800-3ab4-4b13-ba45-c6c13c1c2278',
'2529f127-cff5-43ff-b879-06bc0e3468ff',
'b511be0b-7dc5-4767-a891-37f43d04a5a5',
'a7e031c3-62d4-46db-b2e2-0bdca6bbad65',
'5bba97dc-d6ab-4329-912f-148c8b807056',
'9cf61d88-d096-4981-b0c6-99db77554c01',
'4c722626-c559-4f5a-84bd-8d7d46983e1e',
'7c237e08-3329-4e64-bd2a-063be290e78b',
'4117144f-92e7-454f-9263-dad5e128cadb',
'ce2e7235-26e6-470f-8e05-298193b7f53d',
'6df525e1-b143-4e6f-b667-80c783ae1b66',
'92666b7c-4d50-4530-88e9-ea2d3da9d07a',
'42644c25-fa23-4b4e-8fcc-907cd8dcef60',
'155c11a9-638a-45c8-b172-7cf2e3e16fe6',
'b3da9fec-08ad-4496-a9ac-1411388fb5cc',
'0de07296-e3ff-4fe6-9183-9f421484197c',
'1b6273c6-7769-4daf-abee-93b11b322c73',
'ea50255a-45a4-4846-82e3-02b4f46f5b17',
'eb7045e1-2286-49f1-bce6-21b5d7fa5c32',
'b763c288-4132-434a-a6c9-25ad51b9d961',
'b67702a8-307d-4b20-835e-c0245d0761e5',
'88548251-e59e-4bc3-b71a-f1e9e2369919',
'd3dc5627-503b-48a5-ad79-31ab6c2fd417',
'ec14f8cd-5b1b-4124-a235-f11159984c7c',
'6d9e1212-4fa6-4632-be8a-75c45a474dd3',
'667eac9b-4e90-413d-80f3-d857b9829ab7',
'c091ea30-1862-4b1f-8e92-087b441472c3',
'43c86818-9bfe-46f2-9ae4-4a55a7baef1f',
'ebdaca04-ef29-42f3-8486-a94dade81bf8',
'f8781fbf-5fef-4481-8819-3df1bc724b7f',
'830df9ed-e4a6-4c9a-a97a-aa080fb030e4',
'84703c54-a9dd-400c-9701-2fc40922e3e3',
'c1c674dd-056a-470c-8874-bf70d8fae3a8',
'6a5b3be6-d1de-4f23-a431-b08e7ab231b8',
'ffe34538-3ddd-48de-b4a2-94f9b2dad086',
'2c6de04e-104d-42c8-8448-97d74985dacb',
'2a1882d9-88ca-4849-bcc1-f6914f593407',
'bf2f4106-cee9-419c-b4d1-d7b03a6293d5',
'a6c36f5e-b86c-4164-85ae-8bf0df2e4a90',
'7c19d852-e36a-4353-afea-10e501601d9a',
'00297802-e20a-413f-b389-a6f764b6600e',
'b8a455eb-827d-43a0-a89b-5d017747140f',
'3e85b06a-a6ea-4ce8-a655-44b1fce12138',
'9321b908-f2e4-437b-b53e-ed81754dcace',
'172bada7-f1c5-41c4-836d-05381beaed9a',
'133e902c-5ff0-4119-8078-db3e15006844',
'452bcafd-ab45-4e24-b5e0-13fcf22b0755',
'5e547934-c339-410e-a013-dfefed50f4b8',
'ffa84feb-ca0e-43d3-a04d-a402a8e24a3b',
'ff27037e-cb52-44ef-8979-f6e7ac3ed6f6',
'c853d4c0-d4be-433d-964e-e30bdc35480e',
'8fbe2def-b8ad-4b2d-90c9-0dd4517c67e1',
'03e54581-8fd3-47c3-9143-55368d2e4e86',
'9efae3c7-904c-48a8-939a-e82b46005ae1',
'5955a235-5be6-47bc-8303-ed0c4e68f501',
'e04edfef-69f8-47ff-8df9-dfff0e9218d2',
'f2a7be5a-4f7a-4a96-935e-ca7592855b45',
'7c90289b-be3e-4c9b-917a-d5e27d95dc15',
'0f46a588-b4ff-4a69-99e9-0a0bcf052522',
'cdd689fd-10f3-4cfa-b738-46549e689cac',
'eb7948be-1007-4b0e-b9b6-a5c40bbb9596',
'f20753f0-d09a-4b47-bffe-8f24ec354761',
'4cff04f4-eff9-4a62-bc6e-691accfbd328',
'9a61b980-4a33-465a-bc50-1aba00bc2cf6',
'90fe2016-e79c-456c-a5f9-3a31149fcd65',
'a4c62d7f-34f0-4e2e-9e46-c762d3ab0ff2',
'28dc8121-5e55-46c2-8313-681de2298986',
'dcc578ed-44bb-458f-8ff5-a78ca83f4616',
'aa42debe-3747-4dcd-8bc9-24eb90673fa5',
'5208772d-21f9-46b0-8167-0b05b57296b8',
'a2da748b-fec8-4e10-88ee-de32cbe8dee1',
'26df2a34-b10d-4361-ba2b-d9f966d09f61',
'dd00a8ba-ac49-481b-8d79-0e440adafd77',
'0df983d7-ed5e-44d2-acf1-686822b0cc7e',
'28559e94-ed57-48c8-bc8b-6cc4ad659a61',
'8b385bd3-52aa-48b9-be33-41f4d3fd4531',
'ce1bf5c3-525e-455d-a1e9-dd5f3d68c9d3',
'd0a6aa4c-821c-4bba-b53b-4f230ca3cda4',
'd9e817a2-6657-433b-8b2f-73790561725c',
'33c854eb-d228-4a82-8324-5e455ed1e447',
]
results = []
for snapshot_id in snapshot_id_list:
    status = validate_snapshot_drs_format(snapshot_id)
    results.append([snapshot_id, status])
# Build the results table once, after the loop, so it is always defined by this cell
# (even for an empty list) instead of being rebuilt on every iteration.
results_df = pd.DataFrame(results, columns = ["snapshot_id", "validation_status"])
display(results_df)

# Add and populate anvil_file.is_supplementary

## Script to patch dataset

In [None]:
# Set base parameters
params = {}
params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Loop through datasets and process is_supplementary field
dataset_id_list = [
'8b2b1c92-66cf-403c-8eb0-03b523d1550e',
'595b6755-e7ae-4e83-af2e-693c089aeec3',
'84ac0d05-4be5-43e9-973e-ef999144d802',
'732eaae3-b509-4a7a-8961-09d861e55253',
'544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
'f85ea65e-1943-4bd6-a541-71c5d8465ca9',
'280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
]
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Patching dataset_id: {dataset_id}")
    # Output directory is specific to the dataset being patched
    params["t_output_dir"] = "ingest_pipeline/output/transformed/anvil/{}/table_data".format(dataset_id)
    output, status = isf.identify_supplementary_files(params, dataset_id)
    results.append([dataset_id, status, output])
# Build the results table once, after the loop, so it is always defined by this cell
# (even for an empty list) instead of being rebuilt on every iteration.
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status", "output"])
display(results_df)


## Script to validate patch worked properly

In [None]:
def validate_supp_file_flg(dataset_id):
    """Validate the anvil_file.is_supplementary field of a TDR dataset.

    Three checks run in order: the field exists in the dataset schema, the field is
    populated on every anvil_file row, and no activity links a file flagged as
    supplementary to a biosample or to a file carrying the opposite flag.

    Args:
        dataset_id: ID of the TDR dataset to validate.

    Returns:
        "Success" when every check passes, otherwise a "Failure - ..." string
        naming the first check that failed.
    """
    # Pull the schema and BigQuery location of the dataset from TDR
    tdr_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=tdr_client)
    try:
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        schema_tables = dataset_info["schema"]["tables"]
        bq_location = dataset_info["access_information"]["big_query"]
        bq_project = bq_location["project_id"]
        bq_dataset = bq_location["dataset_name"]
    except Exception:
        return "Failure - Issue Retrieving Dataset Info"

    # Check 1: the first anvil_file table in the schema must declare is_supplementary
    anvil_file_table = next((tbl for tbl in schema_tables if tbl["name"] == "anvil_file"), None)
    field_found = anvil_file_table is not None and any(
        col["name"] == "is_supplementary" for col in anvil_file_table["columns"]
    )
    if not field_found:
        return "Failure - is_supplementary field not found"

    # Check 2: every anvil_file record must have a non-null is_supplementary value
    client = bigquery.Client()
    population_query = """SELECT COUNT(*) AS rec_cnt, COUNT(is_supplementary) AS populated_cnt
                    FROM `{project}.{dataset}.anvil_file`""".format(project=bq_project, dataset=bq_dataset)
    try:
        population_df = client.query(population_query).result().to_dataframe()
        if population_df["rec_cnt"].values[0] != population_df["populated_cnt"].values[0]:
            return "Failure - is_supplementary field not populated"
    except Exception:
        return "Failure - BigQuery Error"

    # Check 3: classify every activity link by the supplementary flags of the files involved
    validation_query = """
            WITH activity_agg
            AS
            (
              SELECT used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_activity`
              UNION ALL 
              SELECT [] AS used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_alignmentactivity`
              UNION ALL 
              SELECT used_biosample_id, generated_file_id, [] AS used_file_id FROM `{project}.{dataset}.anvil_assayactivity`
              UNION ALL 
              SELECT used_biosample_id, generated_file_id, [] AS used_file_id FROM `{project}.{dataset}.anvil_sequencingactivity`
              UNION ALL 
              SELECT [] AS used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_variantcallingactivity`
            ),
            activity_exp 
            AS
            (
              SELECT file_id, int_file_id, biosample_id
              FROM activity_agg
                  LEFT JOIN UNNEST(used_biosample_id) AS biosample_id
                  LEFT JOIN UNNEST(generated_file_id) as file_id
                  LEFT JOIN UNNEST(used_file_id) as int_file_id
            ),
            activity_exp_tagged
            AS
            (
              SELECT a.file_id, b.is_supplementary AS file_id_supp, int_file_id, c.is_supplementary AS int_file_id_supp, biosample_id
              FROM activity_exp a
                  LEFT JOIN  `{project}.{dataset}.anvil_file` b
                  ON a.file_id = b.file_id
                  LEFT JOIN  `{project}.{dataset}.anvil_file` c
                  ON a.int_file_id = c.file_id 
            )
            SELECT CASE WHEN file_id_supp = TRUE AND biosample_id IS NOT NULL THEN 'Supplemental File Linked to BioSample' WHEN (file_id_supp = TRUE AND int_file_id_supp = FALSE) OR (file_id_supp = FALSE AND int_file_id_supp = TRUE) THEN 'Supplemental File Linked to Non-Supplemental File' ELSE 'No Issue Found' END AS finding, COUNT(*) AS occurrences
            FROM activity_exp_tagged
            GROUP by finding
            """.format(project=bq_project, dataset=bq_dataset)
    # Map the two problem findings to their tally slot; anything else counts as "no issue"
    tally_slot_by_finding = {
        "Supplemental File Linked to BioSample": "biosample",
        "Supplemental File Linked to Non-Supplemental File": "nonsupp",
    }
    try:
        findings_df = client.query(validation_query).result().to_dataframe()
        findings = json.loads(findings_df.to_json(orient='records'))
        tallies = {"biosample": 0, "nonsupp": 0, "no_issue": 0}
        for finding_record in findings:
            slot = tally_slot_by_finding.get(finding_record["finding"], "no_issue")
            tallies[slot] = finding_record["occurrences"]
        supp_linked_to_biosample = tallies["biosample"]
        supp_linked_to_nonsupp = tallies["nonsupp"]
        non_issue_links = tallies["no_issue"]
        if supp_linked_to_biosample > 0 or supp_linked_to_nonsupp > 0:
            return f"Failure - Errors found when validating supplementary files flagged in the TDR dataset: Supplemental Files Linked to a Biosample: {str(supp_linked_to_biosample)} Supplemental Files Linked to a Non-Supplemental File: {str(supp_linked_to_nonsupp)} Links with No Issues: {str(non_issue_links)}"
    except Exception:
        return "Failure - BigQuery Error"
    return "Success"

# Loop through datasets and validate is_supplementary field
dataset_id_list = [
'8b2b1c92-66cf-403c-8eb0-03b523d1550e',
'595b6755-e7ae-4e83-af2e-693c089aeec3',
'84ac0d05-4be5-43e9-973e-ef999144d802',
'732eaae3-b509-4a7a-8961-09d861e55253',
'544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
'f85ea65e-1943-4bd6-a541-71c5d8465ca9',
'280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
]
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Validating dataset_id: {dataset_id}")
    status = validate_supp_file_flg(dataset_id)
    results.append([dataset_id, status])
# Build the results table once, after the loop, so it is always defined by this cell
# (even for an empty list) instead of being rebuilt on every iteration.
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Attempt to populate anvil_donor.organism_type

## Script to patch dataset

In [None]:
def populate_organism_type(dataset_id):
    """Rebuild anvil_donor records with organism_type populated and re-ingest them into TDR.

    The function queries the dataset's BigQuery tables, writes the donor records as
    newline-delimited JSON, copies that file to the workspace bucket with gsutil, and
    submits a "replace" ingest of the file into the anvil_donor table.

    Args:
        dataset_id: ID of the TDR dataset to patch.

    Returns:
        "Success" when every step completes, "Failure" otherwise (details are logged).
    """
    # Local import: only this function shells out to gsutil.
    import subprocess

    logging.info(f"Processing anvil_donor.organism_type for Dataset ID = {dataset_id}")
    
    # Retrieve dataset information
    logging.info("Retrieving necessary information from TDR.")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
        return "Failure"

    # Re-process anvil_donor data to include organism_type (where available)
    logging.info("Re-processing existing anvil_donor data to include organism_type value.")
    client = bigquery.Client()
    target_file = "anvil_donor.json"
    destination_dir = "ingest_pipeline/output/transformed/anvil/{}/table_data".format(dataset_id)
    destination_uri = "{}/{}/".format(ws_bucket, destination_dir)
    query = """SELECT donor_id, 
    (SELECT MAX(CASE WHEN REGEXP_CONTAINS(value, '(h37|h38|h39|hg16|hg17|hg18|hg19|hs37|hs38|b37)') THEN 'Homo sapiens' END) AS organism_type FROM `{project}.{dataset}.workspace_attributes` WHERE attribute = 'library:reference') AS organism_type,
    part_of_dataset_id, phenotypic_sex, reported_ethnicity, genetic_ancestry, source_datarepo_row_ids
    FROM `{project}.{dataset}.anvil_donor`""".format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        records_list = json.loads(df.to_json(orient='records'))
        try:
            # Newline-delimited JSON, one record per line, no trailing newline
            with open(target_file, 'w') as outfile:
                outfile.write('\n'.join(json.dumps(record) for record in records_list))
            # Check the exit code explicitly: a failed copy must not be reported as success,
            # otherwise the ingest below would load a stale or missing file.
            upload = subprocess.run(["gsutil", "cp", target_file, destination_uri], capture_output=True, text=True)
            if upload.returncode != 0:
                raise RuntimeError("gsutil cp failed (exit code {}): {}".format(upload.returncode, upload.stderr.strip()))
        finally:
            # Always clean up the local scratch file, even when the upload fails
            if os.path.exists(target_file):
                os.remove(target_file)
        logging.info("Successfully created new anvil_donor.json file.")
    except Exception as e:
        logging.error("Error creating new anvil_donor.json file. Exiting function. Error: {}".format(str(e)))
        return "Failure"

    # Ingest updated anvil_donor data
    logging.info("Ingesting updated anvil_donor data into TDR dataset.")
    source_full_file_path = "{}/{}/{}".format(ws_bucket, destination_dir, "anvil_donor.json")
    ingest_request = {
        "table": "anvil_donor",
        "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "json",
        "load_tag": "Ingest for {}".format(dataset_id),
        "path": source_full_file_path
    }
    # One retry: the ingest is attempted at most twice, 10 seconds apart
    attempt_counter = 0
    while True:
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest from file anvil_donor.json succeeded: {}".format(str(ingest_request_result)[0:1000]))
            break
        except Exception as e:
            logging.error("Error on Dataset Ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                logging.info("Retrying Dataset Ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                continue
            else:
                logging.error("Maximum number of retries exceeded. Exiting function.")
                return "Failure"

    # Return success message if no failures recorded
    logging.info("Function completed successfully.")
    return "Success"

# Loop through datasets and populate anvil_donor.organism_type
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
results = []
for dataset_id in dataset_id_list:
    status = populate_organism_type(dataset_id)
    results.append([dataset_id, status])
# Build the results table once, after the loop, so it is always defined by this cell
# (even for an empty list) instead of being rebuilt on every iteration.
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status"])
display(results_df)


## Script to examine organism_type population

In [None]:
def validate_organism_type(dataset_id):
    """Report whether anvil_donor.organism_type holds any values for a TDR dataset.

    Args:
        dataset_id: ID of the TDR dataset to inspect.

    Returns:
        "Success - Field Populated" when at least one anvil_donor row has a non-null
        organism_type, "Success - Field Not Populated" when none do, or a
        "Failure - ..." string naming the step (dataset lookup / BigQuery) that failed.
    """
    # Retrieve the BigQuery location of the dataset (the schema is not needed for this check)
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving dataset info for {}: {}".format(dataset_id, str(e)))
        return "Failure - Issue Retrieving Dataset Info"
    
    # Count the anvil_donor records that have a non-null organism_type
    client = bigquery.Client()
    query = """SELECT COUNT(organism_type) AS populated_cnt
                FROM `{project}.{dataset}.anvil_donor`""".format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        if df["populated_cnt"].values[0] > 0:
            return "Success - Field Populated"
        else:
            return "Success - Field Not Populated"
    except Exception as e:
        logging.error("Error querying anvil_donor for {}: {}".format(dataset_id, str(e)))
        return "Failure - BigQuery Error"

# Loop through datasets and report whether anvil_donor.organism_type is populated
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
results = []
for dataset_id in dataset_id_list:
    status = validate_organism_type(dataset_id)
    results.append([dataset_id, status])
# Build the results table once, after the loop, so it is always defined by this cell
# (even for an empty list) instead of being rebuilt on every iteration.
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Update references to md5-added files

In [None]:
# Function to collect all datarepo rows for a particular table within a dataset
def collect_all_datarepo_rows(dataset_id, table_name):
    """Return every datarepo_row_id currently present in one table of a TDR dataset.

    Args:
        dataset_id: ID of the TDR dataset.
        table_name: Name of the table within the dataset's BigQuery schema.

    Returns:
        List of datarepo_row_id values read from the table.

    Raises:
        Exception: If the dataset's BigQuery location cannot be retrieved from TDR,
            or if the BigQuery query fails.
    """
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving BQ project and schema: {}".format(str(e)))
        # Without the project and schema no query can be built; surface the real error
        # here instead of failing later on an undefined variable.
        raise Exception(e)
    client = bigquery.Client()
    query = "SELECT datarepo_row_id FROM `{project}.{schema}.{table}`".format(project = bq_project, schema = bq_schema, table = table_name)
    try:
        query_job = client.query(query)
        return [row["datarepo_row_id"] for row in query_job]
    except Exception as e:
        logging.error("Error retrieving datarepo_row_id list: {}".format(str(e)))
        raise Exception(e)

# Function to delete rows from a dataset
def delete_old_records(dataset_id, table, datarepo_row_ids):
    """Soft-delete the given rows from one table of a TDR dataset.

    Args:
        dataset_id: ID of the TDR dataset.
        table: Name of the table the rows belong to.
        datarepo_row_ids: datarepo_row_id values to delete; when empty or None
            nothing is submitted.

    Returns:
        None.

    Raises:
        Exception: If the deletion job cannot be submitted or does not complete.
    """
    logging.info(f"Attempting to delete original {table} records.")
    if not datarepo_row_ids:
        logging.info("No datarepo_row_ids specified for deletion.")
        return

    data_deletion_payload = {
        "deleteType": "soft",
        "specType": "jsonArray",
        "tables": [{
          "tableName": table,
          "jsonArraySpec": {
            "rowIds": datarepo_row_ids
          }
        }]
    }
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        data_deletion_result, job_id = utils.wait_for_tdr_job(datasets_api.apply_dataset_data_deletion(id=dataset_id, data_deletion_request=data_deletion_payload))
        logging.info("Result: {}".format(data_deletion_result))
    except Exception as e:
        # Failures are logged at ERROR level, matching the other TDR helpers in this notebook
        logging.error("Error: {}".format(str(e)))
        raise Exception(e)

def ingest_updated_records(profile_id, dataset_id, table, records_dict, max_attempts=1):
    """Ingest a set of updated records into one table of a TDR dataset.

    Args:
        profile_id: Billing profile ID placed in the ingest request.
        dataset_id: ID of the TDR dataset to ingest into.
        table: Name of the target table.
        records_dict: Records to ingest, sent inline as the request's "records" array.
        max_attempts: Total number of ingest attempts before giving up. Defaults to 1
            (no retry); values below 1 are treated as 1.

    Returns:
        None when an ingest attempt succeeds.

    Raises:
        Exception: If every attempt fails; wraps the error from the last attempt.
    """
    logging.info(f"Submitting ingest for updated {table} records.")
    
    # Build, submit, and monitor ingest request
    ingest_request = {
        "table": table,
        "profile_id": profile_id,
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "array",
        "bulkMode": False,
        "load_tag": f"File ref fields patch for {table} in {dataset_id}",
        "records": records_dict
    }
    # Guarantee at least one attempt so the request is never silently skipped
    max_attempts = max(1, int(max_attempts))
    for attempt in range(1, max_attempts + 1):
        try:
            # Refresh the client on every attempt, as the other TDR helpers in this notebook do
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest succeeded: {}".format(str(ingest_request_result)[0:1000]))
            return
        except Exception as e:
            logging.error("Error on ingest: {}".format(str(e)))
            if attempt < max_attempts:
                logging.info("Retrying ingest (attempt #{})...".format(str(attempt)))
                sleep(10)
            else:
                logging.error("Maximum number of retries exceeded. Logging error.")
                raise Exception(e)
                
def _split_gcs_uri(uri):
    """Split a ``gs://bucket/object`` URI into ``(bucket_name, object_name)``."""
    prefix = "gs://"
    if not isinstance(uri, str) or not uri.startswith(prefix):
        raise ValueError(f"Not a gs:// URI: {uri!r}")
    bucket_name, _, object_name = uri[len(prefix):].partition("/")
    if not bucket_name or not object_name:
        raise ValueError(f"GCS URI is missing a bucket or object name: {uri!r}")
    return bucket_name, object_name


def _decode_file_ref(value):
    """Decode a JSON-encoded file reference; pass ``None`` or already-decoded values through."""
    if isinstance(value, str):
        return json.loads(value)
    return value


def _fetch_update_records(query):
    """Run ``query`` in BigQuery and return its rows as a list of dicts."""
    client = bigquery.Client()
    res = client.query(query).result()
    if res.total_rows > 0:
        logging.info(f"{res.total_rows} records to process.")
        # Round-trip through JSON so every value comes back as a plain JSON type
        return json.loads(res.to_dataframe().to_json(orient='records'))
    logging.info("No records to process.")
    return []


def update_recs_w_file_refs(dataset_id, profile_id="e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"):
    """Re-ingest records that reference files lacking an md5 hash, then rerun the T pipeline.

    Steps, in order:
      1. Read the dataset schema and BigQuery location from TDR.
      2. For each table with ``fileref`` columns (except ``file_inventory`` and
         ``anvil_file``), rebuild the rows that point at a ``file_inventory``
         entry with a NULL ``md5_hash``, ingest them, and delete the old rows.
      3. Do the same for ``file_inventory`` itself, filling ``md5_hash`` from
         the GCS object metadata.
      4. Delete every row from the listed ``anvil_*`` tables.
      5. Run ``utils.run_t_pipeline`` with data validation skipped.

    Args:
        dataset_id: Id of the TDR dataset to process.
        profile_id: Billing profile id used for ingests and passed to the T
            pipeline. Defaults to the value previously hard-coded here.

    Returns:
        "Success", or a "Failure - <stage>" string naming the stage that failed.
        Exceptions raised by ``utils.run_t_pipeline`` are not caught here.
    """
    logging.info(f"Processing md5-added files for Dataset ID = {dataset_id}")

    ## Retrieve dataset information
    logging.info("Retrieving necessary information from TDR.")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        schema_tables = response["schema"]["tables"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
        return "Failure - Pre-processing"

    ## Parse TDR schema to identify file reference fields
    # NOTE(review): array-valued fileref columns get no special handling; the
    # query below compares each column to a single file id. Confirm none exist.
    table_dict = {}
    for table in schema_tables:
        if table["name"] in ["file_inventory", "anvil_file"]:
            continue
        fileref_cols = [column["name"] for column in table["columns"] if column["datatype"] == "fileref"]
        if fileref_cols:
            table_dict[table["name"]] = fileref_cols

    ## Loop through tables and re-process impacted records
    for table, col_list in table_dict.items():
        logging.info(f"Processing updates for {table}.")
        # Retrieve relevant records from BigQuery. Each fileref column is swapped
        # for a {sourcePath, targetPath} JSON object built from the load history.
        old_cols = ", ".join(col_list)
        where_clause = " OR ".join(f"t.{column_name} IN (SELECT file_ref FROM file_list)" for column_name in col_list)
        new_cols = "".join(
            f", CASE WHEN t{idx}.source_name IS NOT NULL THEN TO_JSON(STRUCT(t{idx}.source_name AS sourcePath, t{idx}.target_path AS targetPath)) END AS {column_name}"
            for idx, column_name in enumerate(col_list)
        )
        join_clause = "".join(
            f" LEFT JOIN load_hist t{idx} ON t.{column_name} = t{idx}.file_id"
            for idx, column_name in enumerate(col_list)
        )
        query = """WITH 
            file_list AS (SELECT * FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
            load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
            SELECT t.* EXCEPT({old_cols}){new_cols}
            FROM `{project}.{dataset}.{table}` t {joins} WHERE {where}""".format(project=bq_project, dataset=bq_dataset, table=table, old_cols=old_cols, new_cols=new_cols, joins=join_clause, where=where_clause)
        try:
            records_list = _fetch_update_records(query)
        except Exception as e:
            logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
            return "Failure - Table Processing"
        # Ingest updated records back to TDR dataset
        try:
            datarepo_row_ids = []
            for record in records_list:
                datarepo_row_ids.append(record.pop("datarepo_row_id", None))
                for col in col_list:
                    # The CASE has no ELSE, so a column can be NULL; leave those as None
                    record[col] = _decode_file_ref(record[col])
            if records_list:
                ingest_updated_records(profile_id, dataset_id, table, records_list)
                delete_old_records(dataset_id, table, datarepo_row_ids)
        except Exception as e:
            logging.error(f"Error replacing TDR records: {str(e)}")
            return "Failure - Table Processing"

    ## Re-process file_inventory
    logging.info(f"Processing updates for file_inventory.")
    # Retrieve relevant records from BigQuery. The original file_ref is excluded
    # so the result has a single file_ref column; a duplicate column name breaks
    # DataFrame.to_json(orient='records').
    query = """WITH 
        file_list AS (SELECT file_ref FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
        load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
        SELECT t1.* EXCEPT(file_ref), CASE WHEN t2.source_name IS NOT NULL THEN TO_JSON(STRUCT(t2.source_name AS sourcePath, t2.target_path AS targetPath)) END AS file_ref
        FROM `{project}.{dataset}.file_inventory` t1
          INNER JOIN load_hist t2 ON t1.file_ref = t2.file_id
        WHERE t1.file_ref IN (SELECT file_ref FROM file_list)""".format(project=bq_project, dataset=bq_dataset)
    try:
        records_list = _fetch_update_records(query)
    except Exception as e:
        logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
        return "Failure - File Inventory Processing"
    # Loop through records and update md5_hash from GCS metadata
    try:
        storage_client = storage.Client()
        datarepo_row_ids = []
        for record in records_list:
            bucket_name, object_name = _split_gcs_uri(record["uri"])
            bucket = storage_client.bucket(bucket_name, user_project="anvil-datastorage")
            blob = bucket.get_blob(object_name)
            if blob is None:
                raise ValueError(f"Object not found in GCS: {record['uri']}")
            # NOTE(review): google-cloud-storage reports md5_hash base64-encoded;
            # confirm that is the encoding file_inventory.md5_hash expects.
            record["md5_hash"] = blob.md5_hash
            datarepo_row_ids.append(record.pop("datarepo_row_id", None))
    except Exception as e:
        logging.error(f"Error retrieving file metadata from GCS: {str(e)}")
        return "Failure - File Inventory Processing"
    # Ingest updated records back to TDR dataset
    try:
        for record in records_list:
            # Decode the JSON file reference the same way the table path does
            record["file_ref"] = _decode_file_ref(record.get("file_ref"))
        if records_list:
            ingest_updated_records(profile_id, dataset_id, "file_inventory", records_list)
            delete_old_records(dataset_id, "file_inventory", datarepo_row_ids)
    except Exception as e:
        logging.error(f"Error replacing TDR records: {str(e)}")
        return "Failure - File Inventory Processing"

    ## Empty anvil_% tables
    logging.info("Clearing out existing anvil_% tables")
    table_list = ["anvil_activity", "anvil_alignmentactivity", "anvil_antibody", "anvil_assayactivity", "anvil_biosample", "anvil_diagnosis", "anvil_donor", "anvil_file", "anvil_sequencingactivity", "anvil_variantcallingactivity"]
    for table in table_list:
        try:
            datarepo_row_ids = collect_all_datarepo_rows(dataset_id, table)
            if datarepo_row_ids:
                delete_old_records(dataset_id, table, datarepo_row_ids)
        except Exception as e:
            logging.error(f"Error clearing out existing anvil_% records: {str(e)}")
            return "Failure - anvil_% Record Deletion"

    ## Re-run T pipeline without validation
    params = {}
    params["ws_name"] = ws_name
    params["ws_project"] = ws_project
    params["ws_bucket"] = ws_bucket
    params["ws_bucket_name"] = ws_bucket_name
    params["profile_id"] = profile_id
    params["mapping_target"] = "anvil"
    params["skip_transforms"] = False
    params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names 
    params["skip_schema_extension"] = False
    params["skip_ingests"] = False
    params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
    params["skip_file_relation_inference"] = False
    params["skip_dangling_fk_resolution"] = False
    params["skip_supplementary_file_identification"] = False
    params["skip_snapshot_creation"] = False
    params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
    params["skip_data_validation"] = True
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        phs_id = dataset_info["phs_id"]
        consent_name = dataset_info["properties"]["consent_name"]
        auth_domains = dataset_info["properties"]["auth_domains"]
        # Value is not used below; the lookup is kept so a dataset missing this
        # property still fails at this step, as it did before.
        dataset_info["properties"]["source_workspaces"]
    except Exception as e:
        logging.error(f"Error retrieving dataset details for T pipeline: {str(e)}")
        return "Failure - Dataset Retrieval for T Pipeline"
    if dataset_name:
        params["dataset_id"] = dataset_id
        params["dataset_name"] = dataset_name
        params["phs_id"] = phs_id
        params["consent_name"] = consent_name
        params["auth_domains"] = auth_domains
        utils.run_t_pipeline(params)
    else:
        logging.warning("Dataset name is empty; T pipeline was not run.")

    # Return success message if no failures recorded
    logging.info("Function completed successfully.")
    return "Success"

# Loop through datasets and process md5 updates
dataset_id_list = [
'700303c2-fcef-48a5-9900-096bf34e2d83',
'a715c70d-da92-43ee-a851-1a27277909a2',
]
results = []
for dataset_id in dataset_id_list:
    status = update_recs_w_file_refs(dataset_id)
    results.append([dataset_id, status])
# Build the summary once, after the loop, so it also exists when the id list is empty
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status"])
display(results_df)


In [None]:
# # Testing
# dataset_id = 'bc6075ac-5cfe-4613-8601-36ceb614939e'

# logging.info(f"Processing md5-added files for Dataset ID = {dataset_id}")

# ## Retrieve dataset information
# logging.info("Retrieving necessary information from TDR.")
# src_schema_dict = {}
# api_client = utils.refresh_tdr_api_client()
# datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
# try:
#     response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
#     src_schema_dict["tables"] = response["schema"]["tables"]
#     bq_project = response["access_information"]["big_query"]["project_id"]
#     bq_dataset = response["access_information"]["big_query"]["dataset_name"]
# except Exception as e:
#     logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
#     #return "Failure - Pre-processing"

# ## Parse TDR schema to identify file reference fields
# table_dict = {}
# for table in src_schema_dict["tables"]:
#     if table["name"] in ["file_inventory", "anvil_file"]:
#         continue
#     else:
#         col_list = []
#         for column in table["columns"]:
#             if column["datatype"] == "fileref":
#                 col_list.append([column["name"], column["array_of"]])
#         if col_list:
#             table_dict[table["name"]] = col_list

# ## Loop through tables and re-process impacted records
# for table in table_dict.keys():
#     logging.info(f"Processing updates for {table}.")
#     # Retrieve relevant records from BigQuery
#     col_list = []
#     old_cols = ""
#     new_cols = ""
#     join_clause = ""
#     where_clause = ""
#     for idx, col in enumerate(table_dict[table]):
#         column_name = col[0]
#         col_list.append(column_name)
#         if idx == 0: 
#             old_cols += column_name
#             where_clause += f"t.{column_name} IN (SELECT file_ref FROM file_list)"
#         else:
#             old_cols += ", " + column_name
#             where_clause += f" OR t.{column_name} IN (SELECT file_ref FROM file_list)"
#         new_cols += f", CASE WHEN t{idx}.source_name IS NOT NULL THEN TO_JSON(STRUCT(t{idx}.source_name AS sourcePath, t{idx}.target_path AS targetPath)) END AS {column_name}"
#         join_clause += f" LEFT JOIN load_hist t{idx} ON t.{column_name} = t{idx}.file_id"

#     query = """WITH 
#         file_list AS (SELECT * FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
#         load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
#         SELECT t.* EXCEPT({old_cols}){new_cols}
#         FROM `{project}.{dataset}.{table}` t {joins} WHERE {where}""".format(project=bq_project, dataset=bq_dataset, table=table, old_cols=old_cols, new_cols=new_cols, joins=join_clause, where=where_clause)
#     try:
#         client = bigquery.Client()
#         res = client.query(query).result()
#         if res.total_rows > 0:
#             logging.info(f"{res.total_rows} records to process.")
#             df = res.to_dataframe()
#             records_json = df.to_json(orient='records')
#             records_list = json.loads(records_json)
#         else:
#             logging.info("No records to process.")
#             records_list = []
#     except Exception as e:
#         logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
#         break
#         #return "Failure - Table Processing"
#     # Ingest updated records back to TDR dataset
#     try:
#         datarepo_row_ids = []
#         for record in records_list:
#             datarepo_row_ids.append(record.pop("datarepo_row_id", None))
#             for col in col_list:
#                 record[col] = json.loads(record[col])
#         if records_list:
#             ingest_updated_records("e0e03e48-5b96-45ec-baa4-8cc1ebf74c61", dataset_id, table, records_list)
#             delete_old_records(dataset_id, table, datarepo_row_ids)
#     except Exception as e:
#         logging.error(f"Error replacing TDR records: {str(e)}")
#         break
#         #return "Failure - Table Processing"

# # ## Re-process file_inventory
# # logging.info(f"Processing updates for file_inventory.")
# # # Retrieve relevant records from BigQuery
# # query = """WITH 
# #     file_list AS (SELECT file_ref FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
# #     load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
# #     SELECT t1.*, CASE WHEN t2.source_name IS NOT NULL THEN TO_JSON(STRUCT(t2.source_name AS sourcePath, t2.target_path AS targetPath)) END AS file_ref
# #     FROM `{project}.{dataset}.file_inventory` t1
# #       INNER JOIN load_hist t2 ON t1.file_ref = t2.file_id
# #     WHERE file_ref IN (SELECT file_ref FROM file_list)""".format(project=bq_project, dataset=bq_dataset)
# # try:
# #     client = bigquery.Client()
# #     res = client.query(query).result()
# #     if res.total_rows > 0:
# #         logging.info(f"{res.total_rows} records to process.")
# #         df = res.to_dataframe()
# #         records_json = df.to_json(orient='records')
# #         records_list = json.loads(records_json)
# #     else:
# #         logging.info("No records to process.")
# #         records_list = []
# # except Exception as e:
# #     logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
# #     #return "Failure - File Inventory Processing"
# # # Loop through records and update md5_hash from GCS metadata
# # try:
# #     storage_client = storage.Client()
# #     datarepo_row_ids = []
# #     for record in records_list:
# #         bucket = re.match('gs:\/\/([a-z0-9\-]+)', record["uri"]).group(1)
# #         obj = re.match('gs:\/\/[a-z0-9\-]+\/([A-Za-z0-9\-_\/\.]+)', record["uri"]).group(1)
# #         bucket = storage_client.bucket(bucket, user_project="anvil-datastorage")
# #         blob = bucket.get_blob(obj)
# #         record["md5_hash"] = blob.md5_hash
# #         datarepo_row_ids.append(record.pop("datarepo_row_id", None))
# # except Exception as e:
# #     logging.error(f"Error retrieving file metadata from GCS: {str(e)}")
# #     #return "Failure - File Inventory Processing"
# # # Ingest updated records back to TDR dataset
# # try:
# #     if records_list:
# #         ingest_updated_records("e0e03e48-5b96-45ec-baa4-8cc1ebf74c61", dataset_id, "file_inventory", records_list)
# #         delete_old_records(dataset_id, "file_inventory", datarepo_row_ids)         
# # except Exception as e:
# #     logging.error(f"Error replacing TDR records: {str(e)}")
# #     #return "Failure - File Inventory Processing"

# # ## Empty anvil_% tables
# # logging.info("Clearing out existing anvil_% tables")
# # table_list = ["anvil_activity", "anvil_alignmentactivity", "anvil_antibody", "anvil_assayactivity", "anvil_biosample", "anvil_diagnosis", "anvil_donor", "anvil_file", "anvil_sequencingactivity", "anvil_variantcallingactivity"]
# # for table in table_list:
# #     try:
# #         datarepo_row_ids = collect_all_datarepo_rows(dataset_id, table)
# #         if datarepo_row_ids:
# #             delete_old_records(dataset_id, table, datarepo_row_ids)
# #     except Exception as e:
# #         logging.error(f"Error clearing out existing anvil_% records: {str(e)}")
# #         break
# #         #return "Failure - anvil_% Record Deletion"

# # ## Re-run T pipeline without validation
# # params = {}
# # params["ws_name"] = ws_name
# # params["ws_project"] = ws_project
# # params["ws_bucket"] = ws_bucket
# # params["ws_bucket_name"] = ws_bucket_name
# # params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61" 
# # params["mapping_target"] = "anvil"
# # params["skip_transforms"] = False
# # params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names 
# # params["skip_schema_extension"] = False
# # params["skip_ingests"] = False
# # params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
# # params["skip_file_relation_inference"] = False
# # params["skip_dangling_fk_resolution"] = False
# # params["skip_supplementary_file_identification"] = False
# # params["skip_snapshot_creation"] = False
# # params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
# # params["skip_data_validation"] = True
# # try:
# #     api_client = utils.refresh_tdr_api_client()
# #     datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
# #     dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
# #     dataset_name = dataset_info["name"]
# #     phs_id = dataset_info["phs_id"]
# #     consent_name = dataset_info["properties"]["consent_name"]
# #     auth_domains = dataset_info["properties"]["auth_domains"]
# #     src_workspaces = dataset_info["properties"]["source_workspaces"]
# # except:
# #     dataset_name = ""
# #     return "Failure - Dataset Retrieval for T Pipeline"
# # if dataset_name:
# #     params["dataset_id"] = dataset_id
# #     params["dataset_name"] = dataset_name
# #     params["phs_id"] = phs_id
# #     params["consent_name"] = consent_name
# #     params["auth_domains"] = auth_domains
# #     utils.run_t_pipeline(params)

# # Return success message if no failures recorded
# logging.info("Function completed successfully.")
# #return "Success"


In [None]:
# for idx, record in enumerate(records_list):
#     if record["library_2_estimated_library_size"]:
#         print(str(idx) + " - " + str(record["library_2_estimated_library_size"]))

In [None]:
# records_list[50]

# Add new supplementary workspace files to TDR dataset

## Script to identify new supplementary files and ingest them to TDR dataset

In [None]:
def ingest_supplementary_files(dataset_id, dry_run=True):
    """Find workspace-bucket files missing from ``file_inventory`` and optionally ingest them.

    The dataset's source workspaces are resolved to their buckets, a fresh
    inventory is built with ``bfi.build_inventory``, and any file whose ``uri``
    is not already in the dataset's ``file_inventory`` table is selected.

    Args:
        dataset_id: Id of the TDR dataset to process.
        dry_run: When True (the default) the new records are returned without
            being ingested. This is what the function did before, because an
            early return sat in front of the ingest code. Pass False to submit
            the ingest.

    Returns:
        - A "Failure - ..." string if any lookup step fails.
        - With ``dry_run=True``: the list of new file inventory records.
        - With ``dry_run=False``: "Success" (also when nothing is new), or a
          "Failure - Ingest error: ..." string.
    """
    # Retrieve dataset details
    logging.info("Retrieving dataset details.")
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        source_workspaces = dataset_details["properties"]["source_workspaces"]
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error(f"Error retrieving dataset info: {str(e)}")
        return "Failure - Issue Retrieving Dataset Info"

    # Use source workspace(s) to find workspace bucket(s) to look for new files
    logging.info("Determining source workspace bucket(s).")
    data_files_src_buckets = {}
    for ws in source_workspaces:
        try:
            ws_attributes = utils.get_workspace_attributes("anvil-datastorage", ws)
            src_bucket = ws_attributes["bucketName"] if ws_attributes.get("bucketName") else ""
        except Exception as e:
            logging.error(f"Error retrieving attributes for workspace {ws}: {str(e)}")
            return "Failure - Issue Retrieving Source Buckets"
        if not src_bucket:
            logging.error(f"No bucket found for workspace {ws}.")
            return "Failure - Issue Retrieving Source Buckets"
        if src_bucket not in data_files_src_buckets:
            data_files_src_buckets[src_bucket] = {
                "include_dirs": [],
                "exclude_dirs": []
            }

    # Pull existing file inventory from BigQuery
    logging.info("Pulling existing file inventory records.")
    client = bigquery.Client()
    query = """SELECT uri FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
    try:
        # A set keeps the membership test in the diff below O(1) per file
        existing_uris = {row.uri for row in client.query(query).result()}
    except Exception as e:
        logging.error(f"Error retrieving existing file inventory records: {str(e)}")
        return "Failure - Issue Retrieving Existing File Inventory Records"

    # Build file inventory from workspace bucket(s)
    logging.info("Building new file inventory.")
    params = {}
    params["data_files_src_buckets"] = data_files_src_buckets
    params["google_project"] = "terra-349c8d95"
    params["file_inventory_dir"] = "ingest_pipeline/input/temp/data_files/file_inventory"
    inventory, retry_count = bfi.build_inventory(params)

    # Diff files to ingest
    logging.info("Diffing new and existing file inventory records.")
    ingest_list = [file for file in inventory if file["uri"] not in existing_uris]
    df_inventory = pd.DataFrame(ingest_list)
    records_dict = df_inventory.to_dict(orient="records")
    logging.info(f"New file inventory records identified: {len(records_dict)}")
    if dry_run:
        return records_dict
    if not records_dict:
        # Nothing new to load; skip submitting an empty ingest
        return "Success"

    # Build, submit, and monitor ingest request
    ingest_request = {
        "table": "file_inventory",
        "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "array",
        "load_tag": "Supplementary file ingest for {}".format(dataset_id),
        "records": records_dict
    }
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            return "Success"
        except Exception as e:
            logging.error("Error on supplementary file ingest: {}".format(str(e)))
            if attempt < max_attempts:
                sleep(10)
            else:
                return f"Failure - Ingest error: {str(e)}"
    
# # Loop through datasets and ingest additional files if necessary
# dataset_id_list = [
# 'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
# ]
# results = []
# for dataset_id in dataset_id_list:
#     status = ingest_supplementary_files(dataset_id) 
#     results.append([dataset_id, status])
#     results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status"])
# display(results_df)


In [None]:
dataset_id = 'bf9108b6-bebc-4b3b-8517-6a2cce5f7d89'

# Retrieve dataset details
logging.info("Retrieving dataset details.")
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
try:
    source_workspaces = dataset_details["properties"]["source_workspaces"]
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
except Exception as e:
    print("Failure - Issue Retrieving Dataset Info") 

# Use source workspace(s) to find workspace bucket(s) to look for new files
logging.info("Determining source workspace bucket(s).")
data_files_src_buckets = {}
for ws in source_workspaces:
    try:
        ws_attributes = utils.get_workspace_attributes("anvil-datastorage", ws)
        src_bucket = ws_attributes["bucketName"] if ws_attributes.get("bucketName") else ""
        if not src_bucket:
            print("Failure - Issue Retrieving Source Buckets")
        elif src_bucket not in data_files_src_buckets:
            data_files_src_buckets[src_bucket] = {
                "include_dirs": [],
                "exclude_dirs": []
            }
    except Exception as e:
        print("Failure - Issue Retrieving Source Buckets")

# Pull existing file inventory from BigQuery
logging.info("Pulling existing file inventory records.")
client = bigquery.Client()
query = """SELECT uri FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
print(query)
file_list = []
try:
    output = client.query(query).result()
    if output.total_rows > 0:
        for row in output:
            file_list.append(row.uri)
except Exception as e:
        print("Failure - Issue Retrieving Existing File Inventory Records")

# Build file inventory from workspace bucket(s)
logging.info("Building new file inventory.")
params = {}
params["data_files_src_buckets"] = data_files_src_buckets
params["google_project"] = "terra-349c8d95"
params["file_inventory_dir"] = "ingest_pipeline/input/temp/data_files/file_inventory"
inventory, retry_count = bfi.build_inventory(params)

# Diff files to ingest
logging.info("Diffing new and existing file inventory records.")
ingest_list = []
for file in inventory:
    if file["uri"] not in file_list:
        ingest_list.append(file)
df_inventory = pd.DataFrame(ingest_list)
records_list = df_inventory.to_dict(orient="records")
records_cnt = len(records_list)
logging.info(f"New file inventory records to ingest: {records_cnt}")

# Break records to ingest into chunks if necessary
chunk_size = 100000
chunk_cnt = math.ceil(records_cnt/chunk_size)
for i in range(0, chunk_cnt):
    if i == 0:
        start_row = 0
        end_row = chunk_size
    else:
        start_row = (i*chunk_size) + 1
        end_row = min((i+1)*chunk_size, records_cnt)
    # Write out chunk to file for ingest
    destination_file = "file_inventory_" + str(i) + ".json"
    with open(destination_file, "w") as outfile:
        for idx, val in enumerate(records_list):
            if idx >= start_row and idx <= end_row:
                json.dump(val, outfile)
                if idx < end_row:
                    outfile.write("\n")
    !gsutil cp $destination_file $ws_bucket/ingest_pipeline/input/temp 2> stdout   
    # Build, submit, and monitor ingest request
    logging.info(f"Ingesting new file inventory records into TDR (chunk #{i}).")
    ingest_request = {
        "table": "file_inventory",
        "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "json",
        "load_tag": "Supplementary file ingest for {}".format(dataset_id),
        "bulkMode": True,
        "path": f"{ws_bucket}/ingest_pipeline/input/temp/{destination_file}"
    }
    attempt_counter = 0
    while True:
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            print("Success")
            break
        except Exception as e:
            logging.error("Error on new file inventory records ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                sleep(10)
                continue
            else:
                print(f"Failure - Ingest error (chunk #{i}): {str(e)}")
                break