# Imports

In [None]:
# Install additional modules (one time effort per cloud environment)
!pip install --upgrade import_ipynb data_repo_client urllib3 xmltodict

In [1]:
## imports and environment variables

# Imports
import import_ipynb
import pandas as pd
import os
import re
import json
import data_repo_client
from google.cloud import bigquery
import ingest_pipeline_utilities as utils
import build_file_inventory as bfi
import logging
from time import sleep
import datetime
from google.cloud import storage
import math

# Configure logging format
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%m/%d/%Y %I:%M:%S %p",
)

# pandas display options: no row/column/cell-width limits, wide output,
# centered column headers, and 3 digits of float precision
for pd_option, pd_value in {
    'display.max_rows': None,
    'display.max_columns': None,
    "display.max_colwidth": None,
    'display.width': 1000,
    'display.colheader_justify': 'center',
    'display.precision': 3,
}.items():
    pd.set_option(pd_option, pd_value)

# Environment variables (a missing variable raises KeyError)
ws_name, ws_project, ws_bucket = (
    os.environ[env_key]
    for env_key in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# Bucket name is the bucket path with any leading gs:// removed
ws_bucket_name = re.sub('^gs://', '', ws_bucket)


importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.27: 4/5/2023 2:20pm - Nate Calvanese - Added the drs.anv0 compact identifer to snapshot creation requests
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
importing Jupyter notebook from build_file_inventory.ipynb
Version 2.0.1: 3/23/2022 8:29pm - Nate Calvanese - Added support for a global file exclusion
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.9: 3/8/2023 12:09pm - Nate Calvanese - Performance improvements for file ref lookups
importing Jupyter notebook from build_mapping_query.ipynb
Version 1.0.12: 2/21/2023 2:50pm - Nate Calvanese - Added support for $BQ_DATASET substitution variable
importing Jupyter notebook from output_data_validation.ipynb
Version 2.0.6: 2/28/2023 11:33am -- Updated notebook to be usable in dev (removed TDR host hardcoding)
importing Jupyter notebook from resolve

# Create new snapshot

## Script to create new full view snapshot

In [2]:
# Parameters shared by every snapshot request issued in this cell
params = {
    "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
    "snapshot_readers_list": ["azul-anvil-prod@firecloud.org"],
    "anvil_schema_version": "ANV5",
}

# IDs of the TDR datasets to process; the loop below attempts one new snapshot per entry
dataset_id_run_list = [
'0c18589c-6432-4a6c-90ce-985a47a66f39',
'3a72e4b8-afb4-4299-98ec-a9ba9606be06',
'a9a5deef-8dc5-44eb-bc51-46bd6bcab2a6',
'ddc9e3fc-3f8b-4d2f-b4c6-4aade37b2fad',
'8ec3476f-0e76-4b63-970f-e6d5c078a0d2',
'44f83f20-d618-40b5-b2cb-3676b8fe3ad7',
'a5fe75bb-d28c-42fb-aaf8-92fa37b266d2',
'6545d602-e5b4-4dd1-8f6a-64e0a1952ddc',
'6765ce2d-ebc8-4367-8855-c0f8e62cb355',
'278a26cb-a710-4fff-928e-fc2e7084a75a',
'9fc492f3-8d13-47ae-93e9-812c0224f1aa',
'822d381e-cea0-45bb-8fa0-1b7194b4b64b',
'b7fb531e-25a4-427c-9679-b7bdc3d03535',
'74aee36b-f68b-43a4-8ae6-0d2797a1c4ad',
'ccf1d1c4-f9ad-4f12-8592-d61eb26cb4d4',
'9a32e23e-840d-4ba3-8cd9-392f48b8e9d2',
'5069fc2c-b957-4130-adca-6eabae943867',
'1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
'4e99b8e1-40b9-4fb2-90a0-d85e926ef31e',
'2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
'128332b6-5060-4ec4-b6a6-f53b54a810be',
'06f05f58-3c83-4f5c-bddd-bed7d2d1d147',
'9d796a02-e2aa-4c15-b8d6-1e90cd736681',
'433e3a09-661a-46a5-96f2-dbb07bdc87f3',
'34fd3b22-ac73-47d2-8849-5877158ec072',
'7ce3270e-b2f2-47f4-a288-639751b2f87f',
'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
'41d12dc1-8718-4439-b409-26cc23573107',
'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
'c4c49fcd-0c20-4cff-841a-cb58f5689c5b',
'0d2c9df2-2c40-4683-a093-df731d033762',
'c5c0893f-b254-4038-8d08-b28ef5a26b5d',
'bcfe7f3b-3e63-45de-9e4d-144f9fc63753',
'63b229b5-e7c8-4fd3-bbc8-ecf344da70d4',
'737d39b8-2f99-4eac-bcda-a03996e08939',
'31e61d00-61cc-46f2-a793-8ea8dfbb0832',
'254ffffc-2bd6-4b2e-905b-a8c54c348cd0',
'd6291444-8c3c-470c-b28c-7cf1d5c7aad8',
'ae34e63e-13af-48b8-8b72-8137289091b3',
'3fd2204c-8654-4af7-832f-c186447262e0',
'd48db47e-acba-4377-b031-f6dfc21f3658',
'575dc7da-58ed-407d-9e88-7b586f28bf65',
'fa278604-7d85-4491-a30d-15c7821f8b00',
'b5c0bf91-9d20-41a2-9dd2-87d0ef0310f9',
'32bc49c6-7583-4613-a72f-5edb12b2a808',
'3eb8ea77-4605-4bb7-90f9-671953abe4a2',
'2b08cb76-061d-44c6-a00f-b43a5421df5e',
'ab7e390a-adc5-4f9e-b317-a216a2904c93',
'd1e6d0e4-d49e-4a16-93c6-7956b2c03414',
'713f8676-8034-4827-bccc-cd6d95b1a4c4',
'b00883d8-9251-435d-aefc-8a703d96d2fb',
'eefbea02-0d65-441e-b455-35aa21d25ba3',
'ff7e3be2-c0ac-4d97-85da-6229bf7585ac',
'7ac92a42-e112-49c3-a8f5-8ad2c7ef5578',
'0701aae2-8661-4eec-84e0-7c8be1c89a18',
'7efb1905-34b4-4f1c-a8a6-8e64b3640a68',
'11a2b088-8c1c-47d2-9c1e-455d457d2f05',
'74608bd9-39e4-4f48-9b7c-1cd9d3c599c9',
'7baf8e8c-de11-452d-b2e1-aad7c08cc18e',
'23b0219d-0820-4017-b942-bda8578e90e2',
'a5f631ea-2b4b-43f2-9ea0-e31f2b11fa27',
'8523489a-f57c-4993-81e4-1ed86a5c092d',
'9cb5ce25-38e8-4628-9ddf-d6aedf5efe0f',
'a647528d-925e-4c02-8825-ff54720c6ee4',
'2c6f63b2-439e-499f-b687-b3fdd88a492e',
'68a916af-2e0c-41bd-8535-c7eacbc2d1b7',
'0e7f31a0-c712-4ebf-ab3a-64c37f43e52a',
'c8b1d323-f352-482e-bf17-82075c23dcee',
'd30f51c7-d642-4e7d-a168-967b9520a80a',
'8d89608c-0d61-4d71-a2e3-9fbc6cda69bf',
'71219f56-551f-4ad4-9a38-cc4aaf8a1e9a',
'48dd6010-77dc-465b-a27c-695e29b57a5e',
'582f5f8d-b96f-490e-b417-ba824baeb06c',
'7e825ee6-7c03-43cc-b0a4-0d9203a30bd9',
'2843292e-e494-4642-90e0-57e5c153f12c',
'4ecbb7c8-0246-47f8-9654-4caca1d52565',
'7e3ea1bd-95ba-4cad-90c8-3eec95be9cc8',
'bbba696b-d023-4bb1-a213-c8bee31e8bae',
'470eee0f-2053-4d9b-9f5e-ca9661a6cc16',
'5c1dc76d-b703-445c-9b38-cc2d00b9ab16',
'74ede771-6781-4980-bfb9-5d853b7cdd6f',
'6c47e282-5d5e-445c-b6bd-c0024946fbe0',
'ff8ffbcf-c932-48c7-8d5e-d995d5680e21',
'9d74b4f0-b2d4-46aa-867a-52fb6102bfdf',
'e34f15f7-c225-4314-a638-90504bb0aa0d',
'c1d222ab-bc0e-4e13-8379-0ee5be9e140e',
'582187a5-ad63-4759-9162-55fa6337eb07',
'd1e8d19a-970d-4ede-b5bc-9cab7237adec',
'9f152896-ebf1-4756-b678-bdf739a92256',
'478aa270-fbd4-4a45-8f63-221b4066168e',
'0b06619d-39d9-4437-8c42-2e415faa634c',
'e9c7ad29-2213-4648-9164-33a07bd42cdb',
'9e3fb02d-dcf6-486f-a42d-89446a852057',
'15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
'3a3100bb-369e-47c1-a77c-2cacb7cf020d',
'2c11b505-17c8-402e-8422-0239accb449d',
'e25a8172-1e34-442c-a45d-583027a2d734',
'a6d7e030-e6c8-4c62-8cb5-165ef54987c4',
'128dce74-fa37-4f2f-8a80-d542edd81a11',
'e5d3e605-67a5-4317-b535-f75432700279',
'7feeb2b1-1926-4968-b6dc-e0a1e4cf8d4a',
'352a503b-41eb-4a84-b257-68d70e55337e',
'703c4bc2-81bf-435a-87fa-21dc9278bad6',
'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
'1b3290fb-4be1-4558-9d66-92746f0f38d4',
'15d41c35-943c-474b-afa6-e1c6d6e4be2b',
'3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
'c9dd3578-01db-4687-9807-4f71368941d1',
'5edcc3db-c676-412a-9506-600959bb81f2',
'677f0bdf-6c5c-462b-8294-3666f777bbc5',
'17a93d6f-8814-431d-855d-efbfbcc8fcda',
'3068f6fa-eada-475c-9571-d38bf1752878',
'ec4d928c-4f52-4581-b8fe-92b6e241f0e1',
'0d9400f9-013e-4e53-aa80-c8cb8022d3b5',
'ac4ecb0d-d56d-4330-b5a9-72acd5fe79a1',
'f8df3502-aeac-47a0-94c8-80758d082829',
'12720dd2-3513-4e0f-8ca0-109cc489e17f',
'c078b6ca-a3f3-4746-8631-9bb00eb5d954',
'47577b01-d7a7-497d-b261-183dade9cd84',
]
# Create a full-view snapshot for each dataset in dataset_id_run_list and record one
# [dataset_id, run_status, snapshot_id] row per dataset, including datasets that fail.
snapshot_id_pattern = re.compile("{'id': '([a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'")
results = []
for dataset in dataset_id_run_list:
    dataset_id = dataset

    # Retrieve the dataset attributes needed to build the snapshot request
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        phs_id = dataset_info["phs_id"]
        consent_name = dataset_info["properties"]["consent_name"]
        auth_domains = dataset_info["properties"]["auth_domains"]
        # Not used below; the lookup is kept so that a dataset missing this property
        # is still treated as a retrieval failure, as before.
        src_workspaces = dataset_info["properties"]["source_workspaces"]
    except Exception as e:
        # Report the failure instead of silently dropping the dataset from the results
        logging.error(f"Error retrieving information from TDR for Dataset ID = {dataset_id}. Skipping snapshot creation. Error: {e}")
        results.append([dataset_id, "Error", ""])
        continue
    if not dataset_name:
        logging.error(f"No dataset name returned for Dataset ID = {dataset_id}. Skipping snapshot creation.")
        results.append([dataset_id, "Error", ""])
        continue

    # Build the per-dataset request parameters and run snapshot creation
    params["ws_bucket"] = ws_bucket
    params["dataset_id"] = dataset_id
    params["dataset_name"] = dataset_name
    params["phs_id"] = phs_id
    params["consent_name"] = consent_name
    params["auth_domains"] = auth_domains
    params["pipeline_results"] = []
    current_datetime = datetime.datetime.now()
    current_datetime_string = current_datetime.strftime("%Y%m%d%H%M")
    params["snapshot_name"] = params["dataset_name"] + "_" + params["anvil_schema_version"] + "_" + current_datetime_string
    utils.create_and_share_snapshot(params)

    # Inspect the entries collected in params["pipeline_results"] during the call above
    int_df_results = pd.DataFrame(params["pipeline_results"], columns = ["Dataset", "Time", "Step", "Task", "Status", "Message"])
    errors = int_df_results[int_df_results["Status"].str.contains("Error")]
    if len(errors) > 0:
        results.append([dataset_id, "Error", ""])
        continue

    # Pull the new snapshot ID out of the "Create and Share Snapshot" message(s).
    # Each message is searched directly, so the result does not depend on how pandas
    # renders a Series, and a missing ID no longer raises.
    snapshot_messages = int_df_results.loc[int_df_results["Task"] == "Create and Share Snapshot", "Message"]
    snapshot_id_match = None
    for message in snapshot_messages:
        snapshot_id_match = snapshot_id_pattern.search(str(message))
        if snapshot_id_match:
            break
    if snapshot_id_match:
        snapshot_id = snapshot_id_match[1]
        results.append([dataset_id, "Success", snapshot_id])
    else:
        logging.error(f"No snapshot ID found in the pipeline results for Dataset ID = {dataset_id}. Check TDR before re-running this dataset.")
        results.append([dataset_id, "Error", ""])
results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status", "snapshot_id"])
display(results_df)


04/27/2023 12:54:23 PM - INFO: Creating full-view snapshot.
04/27/2023 12:54:23 PM - INFO: Submitting snapshot request.
TDR Job ID: 1YojuzleSqi6Dum_9fpy6w
04/27/2023 12:56:05 PM - INFO: Snapshot Creation succeeded: {'id': '24c427f6-17b9-4cd1-962f-92a12b090d8a', 'name': 'ANVIL_CCDG_Broad_NP_Epilepsy_USAVAN_HMB_GSO_WES_20221207_ANV5_202304271254', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_NP_Epilepsy_USAVAN_HMB_GSO_WES_20221207', 'createdDate': '2023-04-27T12:54:43.319854Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'HMB-GSO', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-6308580c', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': True, 'tags':

04/27/2023 01:08:35 PM - INFO: Creating full-view snapshot.
04/27/2023 01:08:35 PM - INFO: Submitting snapshot request.
TDR Job ID: PJQRr42-QXqsVT4NGOkA1w
04/27/2023 01:10:26 PM - INFO: Snapshot Creation succeeded: {'id': 'd0709a13-9701-437d-848f-fbce26b3bf5b', 'name': 'ANVIL_CMG_Broad_Muscle_Laing_WES_20221208_ANV5_202304271308', 'description': 'Full view snapshot of ANVIL_CMG_Broad_Muscle_Laing_WES_20221208', 'createdDate': '2023-04-27T13:08:54.334187Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'HMB-MDS', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-736a5f1f', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': True, 'tags': []}
04/27/2023 01:10:26 PM - 

04/27/2023 01:22:37 PM - INFO: Creating full-view snapshot.
04/27/2023 01:22:37 PM - INFO: Submitting snapshot request.
TDR Job ID: SsnndnABTimINf-YlYkSzw
04/27/2023 01:24:08 PM - INFO: Snapshot Creation succeeded: {'id': '328745cc-e527-4780-af6f-30ab69d26702', 'name': 'ANVIL_ccdg_baylor_cvd_hemstroke_washu_ds_wgs_20221128_ANV5_202304271322', 'description': 'Full view snapshot of ANVIL_ccdg_baylor_cvd_hemstroke_washu_ds_wgs_20221128', 'createdDate': '2023-04-27T13:22:56.681489Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-9c18f51b', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': True, 'tags': []}
04/27

04/27/2023 01:34:16 PM - INFO: Creating full-view snapshot.
04/27/2023 01:34:16 PM - INFO: Submitting snapshot request.
TDR Job ID: 6jl5Ny9xTGucM2y0xDgtIg
04/27/2023 01:35:27 PM - INFO: Snapshot Creation succeeded: {'id': 'f875fdc4-f57d-4a4c-9b22-daf101156d26', 'name': 'ANVIL_ccdg_broad_cvd_af_olesen_wes_20221202_ANV5_202304271334', 'description': 'Full view snapshot of ANVIL_ccdg_broad_cvd_af_olesen_wes_20221202', 'createdDate': '2023-04-27T13:34:35.433251Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-2fdb307c', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': True, 'tags': []}
04/27/2023 01:35:28 PM - 

04/27/2023 01:46:36 PM - INFO: Creating full-view snapshot.
04/27/2023 01:46:36 PM - INFO: Submitting snapshot request.
TDR Job ID: 3gaxi7WUSRaYAeSQ-4A9yA
04/27/2023 01:48:28 PM - INFO: Snapshot Creation succeeded: {'id': '8956cc4d-58be-46ae-a81e-74607ffbd9d3', 'name': 'ANVIL_eMERGE_PRS_Arrays_20221220_ANV5_202304271346', 'description': 'Full view snapshot of ANVIL_eMERGE_PRS_Arrays_20221220', 'createdDate': '2023-04-27T13:46:55.738710Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-ce8c469f', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': True, 'tags': []}
04/27/2023 01:48:28 PM - INFO: Creating full-vi

04/27/2023 02:00:09 PM - INFO: Creating full-view snapshot.
04/27/2023 02:00:09 PM - INFO: Submitting snapshot request.
TDR Job ID: iM3_8w5zRCSoWlAbzpawYQ
04/27/2023 02:01:40 PM - INFO: Snapshot Creation succeeded: {'id': '50a37ecf-071a-4f8f-9c72-70280973f9eb', 'name': 'ANVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPIL_BA_MDS_GSA_MD_20221117_ANV5_202304271400', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPIL_BA_MDS_GSA_MD_20221117', 'createdDate': '2023-04-27T14:00:28.212862Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-70c803d7', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': Tru

04/27/2023 02:12:40 PM - INFO: Creating full-view snapshot.
04/27/2023 02:12:40 PM - INFO: Submitting snapshot request.
TDR Job ID: K1ZPAp8MSGeOaI8JfJuq4A
04/27/2023 02:14:01 PM - INFO: Snapshot Creation succeeded: {'id': 'ae101395-36eb-4d59-9970-6696b82057db', 'name': 'ANVIL_CCDG_Broad_NP_Epilepsy_CZEMTH_GRU_GSA_MD_20221121_ANV5_202304271412', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_NP_Epilepsy_CZEMTH_GRU_GSA_MD_20221121', 'createdDate': '2023-04-27T14:12:59.492963Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-b2e64670', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': True, 'tags': []}
0

04/27/2023 02:26:22 PM - INFO: Creating full-view snapshot.
04/27/2023 02:26:22 PM - INFO: Submitting snapshot request.
TDR Job ID: IECGcN6xQsmrEFikRzw_2Q
04/27/2023 02:27:53 PM - INFO: Snapshot Creation succeeded: {'id': '3984cfaf-0034-4b7e-ae21-8ae9810a62a1', 'name': 'ANVIL_CCDG_Broad_NP_Epilepsy_FINUVH_HMB_NPU_MDS_GSA_MD_20221128_ANV5_202304271426', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_NP_Epilepsy_FINUVH_HMB_NPU_MDS_GSA_MD_20221128', 'createdDate': '2023-04-27T14:26:42.215461Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-6ecff823', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': Tru

04/27/2023 02:39:45 PM - INFO: Creating full-view snapshot.
04/27/2023 02:39:45 PM - INFO: Submitting snapshot request.
TDR Job ID: SZorwbpBTGSUgtIdze8hYQ
04/27/2023 02:42:17 PM - INFO: Snapshot Creation succeeded: {'id': '63363aed-e5ea-4ba4-8962-da03369ca536', 'name': 'ANVIL_CCDG_Broad_NP_Epilepsy_IRLRCI_GRU_IRB_GSA_MD_20221129_ANV5_202304271439', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_NP_Epilepsy_IRLRCI_GRU_IRB_GSA_MD_20221129', 'createdDate': '2023-04-27T14:40:59.969359Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-1a26d8e4', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': True, 'tags

04/27/2023 02:54:29 PM - INFO: Creating full-view snapshot.
04/27/2023 02:54:29 PM - INFO: Submitting snapshot request.
TDR Job ID: ostOr5f3SQ6gH8tIauq4rQ
04/27/2023 02:56:00 PM - INFO: Snapshot Creation succeeded: {'id': 'c3bb5d5c-dad2-4762-ac97-a8d920b414b5', 'name': 'ANVIL_CCDG_Broad_NP_Epilepsy_LEBABM_DS_Epilepsy_GSA_MD_20221130_ANV5_202304271454', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_NP_Epilepsy_LEBABM_DS_Epilepsy_GSA_MD_20221130', 'createdDate': '2023-04-27T14:54:47.916351Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-de7c951d', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': Tru

04/27/2023 03:08:11 PM - INFO: Creating full-view snapshot.
04/27/2023 03:08:11 PM - INFO: Submitting snapshot request.
TDR Job ID: HW-QmDrLSgKPQ65KThMEpA
04/27/2023 03:10:12 PM - INFO: Snapshot Creation succeeded: {'id': 'c191a23a-926c-4a61-8294-27496a41a4da', 'name': 'ANVIL_CCDG_Broad_NP_Epilepsy_USACCH_DS_NEURO_MDS_GSA_MD_20221130_ANV5_202304271508', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_NP_Epilepsy_USACCH_DS_NEURO_MDS_GSA_MD_20221130', 'createdDate': '2023-04-27T15:08:30.149581Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-24ec8959', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': T

04/27/2023 03:23:55 PM - INFO: Creating full-view snapshot.
04/27/2023 03:23:55 PM - INFO: Submitting snapshot request.
TDR Job ID: cFRHwr2CS2ydwm4edEMxyA
04/27/2023 03:25:36 PM - INFO: Snapshot Creation succeeded: {'id': '8c634fb0-da0e-403c-8e4a-13cef21411a7', 'name': 'ANVIL_CCDG_Broad_NP_Epilepsy_USAMGH_MGBB_HMB_MDS_GSA_MD_20221130_ANV5_202304271523', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_NP_Epilepsy_USAMGH_MGBB_HMB_MDS_GSA_MD_20221130', 'createdDate': '2023-04-27T15:24:14.164905Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-8e874f13', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': T

04/27/2023 03:39:18 PM - INFO: Creating full-view snapshot.
04/27/2023 03:39:18 PM - INFO: Submitting snapshot request.
TDR Job ID: te4SyKrSRzSXeNZl4YMhiw
04/27/2023 03:40:52 PM - INFO: Snapshot Creation succeeded: {'id': '5681d110-8c84-478c-9d1f-7935a54b86ca', 'name': 'ANVIL_CCDG_Broad_NP_Epilepsy_USAVAN_HMB_GSO_GSA_MD_20221201_ANV5_202304271539', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_NP_Epilepsy_USAVAN_HMB_GSO_GSA_MD_20221201', 'createdDate': '2023-04-27T15:39:36.981088Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'TBD', 'phsId': '', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-a7e81204', 'storageAccount': None, 'selfHosted': True, 'globalFileIds': True, 'tags

04/27/2023 03:56:06 PM - INFO: Creating full-view snapshot.
04/27/2023 03:56:06 PM - INFO: Attempting to lookup consent code using PHS: 1489 and Consent Name: NA.
04/27/2023 03:56:06 PM - INFO: Submitting snapshot request.
TDR Job ID: CIjh1y6nRP6Gfyb_o8hC_Q
04/27/2023 03:57:27 PM - INFO: Snapshot Creation succeeded: {'id': '30851e99-bbd0-48d3-b4f0-e3525b0506ca', 'name': 'ANVIL_CCDG_Broad_NP_Epilepsy_AUSALF_HMB_IRB_WES_20230128_ANV5_202304271556', 'description': 'Full view snapshot of ANVIL_CCDG_Broad_NP_Epilepsy_AUSALF_HMB_IRB_WES_20230128', 'createdDate': '2023-04-27T15:56:25.262425Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'NA', 'phsId': 'phs001489', 'cloudPlatform': 'gcp', 'da

04/27/2023 04:12:09 PM - INFO: Creating full-view snapshot.
04/27/2023 04:12:09 PM - INFO: Attempting to lookup consent code using PHS: 1616 and Consent Name: GRU.
04/27/2023 04:12:10 PM - INFO: Submitting snapshot request.
TDR Job ID: PREEKswLSI-tQo116y5EBw
04/27/2023 04:13:30 PM - INFO: Snapshot Creation succeeded: {'id': 'e5ccacfe-1b14-4331-bd8f-a542b5a70d23', 'name': 'ANVIL_eMERGE_GRU_eMERGEseq_20230130_ANV5_202304271612', 'description': 'Full view snapshot of ANVIL_eMERGE_GRU_eMERGEseq_20230130', 'createdDate': '2023-04-27T16:12:28.506288Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'c1', 'phsId': 'phs001616', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-34f8138d', 'storage

04/27/2023 04:24:20 PM - INFO: Creating full-view snapshot.
04/27/2023 04:24:20 PM - INFO: Attempting to lookup consent code using PHS: 1616 and Consent Name: HMB-NPU.
04/27/2023 04:24:20 PM - INFO: Submitting snapshot request.
TDR Job ID: VMxYTcWkQPGo-LELOy6n4A
04/27/2023 04:25:41 PM - INFO: Snapshot Creation succeeded: {'id': '84cfc3d8-282e-4102-ae43-5513e7a3efd5', 'name': 'ANVIL_eMERGE_HMB_NPU_eMERGEseq_20230130_ANV5_202304271624', 'description': 'Full view snapshot of ANVIL_eMERGE_HMB_NPU_eMERGEseq_20230130', 'createdDate': '2023-04-27T16:24:39.694181Z', 'profileId': 'e0e03e48-5b96-45ec-baa4-8cc1ebf74c61', 'storage': [{'region': 'us-central1', 'cloudResource': 'bigquery', 'cloudPlatform': 'gcp'}, {'region': 'us-east4', 'cloudResource': 'firestore', 'cloudPlatform': 'gcp'}, {'region': 'us-central1', 'cloudResource': 'bucket', 'cloudPlatform': 'gcp'}], 'secureMonitoringEnabled': False, 'consentCode': 'c8', 'phsId': 'phs001616', 'cloudPlatform': 'gcp', 'dataProject': 'datarepo-1ddf2a8

Unnamed: 0,dataset_id,run_status,snapshot_id
0,0c18589c-6432-4a6c-90ce-985a47a66f39,Success,24c427f6-17b9-4cd1-962f-92a12b090d8a
1,3a72e4b8-afb4-4299-98ec-a9ba9606be06,Success,428c8260-1b27-446c-8484-a28341b41dcc
2,a9a5deef-8dc5-44eb-bc51-46bd6bcab2a6,Success,ea4e8c79-b6bd-4b24-990e-624de9d15835
3,ddc9e3fc-3f8b-4d2f-b4c6-4aade37b2fad,Success,ce525190-7d7f-4e57-8176-398cd9b0b7c5
4,8ec3476f-0e76-4b63-970f-e6d5c078a0d2,Success,1bb3d012-1637-4f61-ba1e-a8549a43973e
5,44f83f20-d618-40b5-b2cb-3676b8fe3ad7,Success,e6dfa202-d2a0-407d-be70-84cb53c9f9ec
6,a5fe75bb-d28c-42fb-aaf8-92fa37b266d2,Success,e9fa838e-b173-4262-8fb6-e5eef53856ab
7,6545d602-e5b4-4dd1-8f6a-64e0a1952ddc,Success,636272e4-d4e2-4a25-ba10-e1d1cb9352bb
8,6765ce2d-ebc8-4367-8855-c0f8e62cb355,Success,d0709a13-9701-437d-848f-fbce26b3bf5b
9,278a26cb-a710-4fff-928e-fc2e7084a75a,Success,06216d97-7d1d-4105-bf60-958b71c02cfd


## Verify Snapshots Have Properly Formatted DRS URI

In [6]:
def validate_snapshot_drs_format(snapshot_id):
    """Check that the file_ref values in a snapshot's anvil_file table contain the expected DRS prefix.

    Looks up the snapshot's BigQuery project and dataset in TDR, then compares the
    number of non-null anvil_file.file_ref values with the number of those values
    that contain the literal text "drs://drs.anv0:v2_".

    Args:
        snapshot_id: ID of the TDR snapshot to validate.

    Returns:
        str: "Success" when the two counts are equal (including when both are zero),
        "Failure: Only X of Y records properly formatted" when they differ,
        "Failure - Issue Retrieving Snapshot Info" when the TDR lookup fails, or
        "Failure - BigQuery Error" when the query fails. Failures are also logged.
    """
    drs_prefix = "drs://drs.anv0:v2_"

    # Retrieve snapshot information
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    try:
        response = snapshots_api.retrieve_snapshot(id=snapshot_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error(f"Error retrieving information from TDR for Snapshot ID = {snapshot_id}. Error: {e}")
        return "Failure - Issue Retrieving Snapshot Info"

    # Count the file_ref values and how many of them contain the expected prefix.
    # STRPOS does a literal substring test; in a LIKE pattern the "_" after "v2"
    # would be a single-character wildcard and so match any character there.
    client = bigquery.Client()
    query = """SELECT COUNT(file_ref) AS rec_cnt, COUNTIF(STRPOS(file_ref, '{prefix}') > 0) AS valid_cnt
                FROM `{project}.{dataset}.anvil_file`""".format(prefix=drs_prefix, project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        rec_cnt = df["rec_cnt"].values[0]
        valid_cnt = df["valid_cnt"].values[0]
    except Exception as e:
        logging.error(f"Error running BigQuery validation for Snapshot ID = {snapshot_id}. Error: {e}")
        return "Failure - BigQuery Error"
    if rec_cnt == valid_cnt:
        return "Success"
    return f"Failure: Only {valid_cnt} of {rec_cnt} records properly formatted"

# IDs of the TDR snapshots to check; the loop below runs validate_snapshot_drs_format on each entry
snapshot_id_list = [
'c3d22305-b3f2-4561-a5b9-bed82ee742f4',
'9fe2abd4-70b4-4eee-b00d-38726ced8620',
'5329c25e-ccad-435d-9250-6fcc3ff88472',
'ced601b2-9a11-40e9-8067-241e5a5996ed',
'8165245c-2003-4ec7-bf57-731959022d47',
'737d454c-88be-477f-ae2c-ef473e2106ce',
'3bdbad9e-f9d4-4442-8606-791d490bf0af',
'cd19195f-25a0-44b1-b47d-ec99141833fc',
'b897e519-ba8b-4758-a263-6d57bd3b8e2b',
'1d385cfc-4bed-4f52-8f7b-ea54fc44b4f7',
'02d25240-823f-4b1d-8562-95385716a453',
'1974a21b-c409-4736-a3d7-e195fa96c4eb',
'99b46287-4790-492c-8a12-bea33f0f927c',
'c6ef5822-3929-4ae7-b5bc-dc27528bf226',
'08d19a7e-b868-4766-9f7e-d879d972cbd7',
'35186e6d-2728-4a8e-b0ad-6b34d0fe480c',
'b0d176bf-d094-4e33-a34b-b83a94de86ea',
'cc6bacc8-29fa-4d97-8856-79f52ea50c6f',
'85b721da-ad8e-4d82-93f0-0988f94af22e',
'407c7800-3ab4-4b13-ba45-c6c13c1c2278',
'2529f127-cff5-43ff-b879-06bc0e3468ff',
'b511be0b-7dc5-4767-a891-37f43d04a5a5',
'a7e031c3-62d4-46db-b2e2-0bdca6bbad65',
'5bba97dc-d6ab-4329-912f-148c8b807056',
'9cf61d88-d096-4981-b0c6-99db77554c01',
'4c722626-c559-4f5a-84bd-8d7d46983e1e',
'7c237e08-3329-4e64-bd2a-063be290e78b',
'4117144f-92e7-454f-9263-dad5e128cadb',
'ce2e7235-26e6-470f-8e05-298193b7f53d',
'6df525e1-b143-4e6f-b667-80c783ae1b66',
'92666b7c-4d50-4530-88e9-ea2d3da9d07a',
'42644c25-fa23-4b4e-8fcc-907cd8dcef60',
'155c11a9-638a-45c8-b172-7cf2e3e16fe6',
'b3da9fec-08ad-4496-a9ac-1411388fb5cc',
'0de07296-e3ff-4fe6-9183-9f421484197c',
'1b6273c6-7769-4daf-abee-93b11b322c73',
'ea50255a-45a4-4846-82e3-02b4f46f5b17',
'eb7045e1-2286-49f1-bce6-21b5d7fa5c32',
'b763c288-4132-434a-a6c9-25ad51b9d961',
'b67702a8-307d-4b20-835e-c0245d0761e5',
'88548251-e59e-4bc3-b71a-f1e9e2369919',
'd3dc5627-503b-48a5-ad79-31ab6c2fd417',
'ec14f8cd-5b1b-4124-a235-f11159984c7c',
'6d9e1212-4fa6-4632-be8a-75c45a474dd3',
'667eac9b-4e90-413d-80f3-d857b9829ab7',
'c091ea30-1862-4b1f-8e92-087b441472c3',
'43c86818-9bfe-46f2-9ae4-4a55a7baef1f',
'ebdaca04-ef29-42f3-8486-a94dade81bf8',
'f8781fbf-5fef-4481-8819-3df1bc724b7f',
'830df9ed-e4a6-4c9a-a97a-aa080fb030e4',
'84703c54-a9dd-400c-9701-2fc40922e3e3',
'c1c674dd-056a-470c-8874-bf70d8fae3a8',
'6a5b3be6-d1de-4f23-a431-b08e7ab231b8',
'ffe34538-3ddd-48de-b4a2-94f9b2dad086',
'2c6de04e-104d-42c8-8448-97d74985dacb',
'2a1882d9-88ca-4849-bcc1-f6914f593407',
'bf2f4106-cee9-419c-b4d1-d7b03a6293d5',
'a6c36f5e-b86c-4164-85ae-8bf0df2e4a90',
'7c19d852-e36a-4353-afea-10e501601d9a',
'00297802-e20a-413f-b389-a6f764b6600e',
'b8a455eb-827d-43a0-a89b-5d017747140f',
'3e85b06a-a6ea-4ce8-a655-44b1fce12138',
'9321b908-f2e4-437b-b53e-ed81754dcace',
'172bada7-f1c5-41c4-836d-05381beaed9a',
'133e902c-5ff0-4119-8078-db3e15006844',
'452bcafd-ab45-4e24-b5e0-13fcf22b0755',
'5e547934-c339-410e-a013-dfefed50f4b8',
'ffa84feb-ca0e-43d3-a04d-a402a8e24a3b',
'ff27037e-cb52-44ef-8979-f6e7ac3ed6f6',
'c853d4c0-d4be-433d-964e-e30bdc35480e',
'8fbe2def-b8ad-4b2d-90c9-0dd4517c67e1',
'03e54581-8fd3-47c3-9143-55368d2e4e86',
'9efae3c7-904c-48a8-939a-e82b46005ae1',
'5955a235-5be6-47bc-8303-ed0c4e68f501',
'e04edfef-69f8-47ff-8df9-dfff0e9218d2',
'f2a7be5a-4f7a-4a96-935e-ca7592855b45',
'7c90289b-be3e-4c9b-917a-d5e27d95dc15',
'0f46a588-b4ff-4a69-99e9-0a0bcf052522',
'cdd689fd-10f3-4cfa-b738-46549e689cac',
'eb7948be-1007-4b0e-b9b6-a5c40bbb9596',
'f20753f0-d09a-4b47-bffe-8f24ec354761',
'4cff04f4-eff9-4a62-bc6e-691accfbd328',
'9a61b980-4a33-465a-bc50-1aba00bc2cf6',
'90fe2016-e79c-456c-a5f9-3a31149fcd65',
'a4c62d7f-34f0-4e2e-9e46-c762d3ab0ff2',
'28dc8121-5e55-46c2-8313-681de2298986',
'dcc578ed-44bb-458f-8ff5-a78ca83f4616',
'aa42debe-3747-4dcd-8bc9-24eb90673fa5',
'5208772d-21f9-46b0-8167-0b05b57296b8',
'a2da748b-fec8-4e10-88ee-de32cbe8dee1',
'26df2a34-b10d-4361-ba2b-d9f966d09f61',
'dd00a8ba-ac49-481b-8d79-0e440adafd77',
'0df983d7-ed5e-44d2-acf1-686822b0cc7e',
'28559e94-ed57-48c8-bc8b-6cc4ad659a61',
'8b385bd3-52aa-48b9-be33-41f4d3fd4531',
'ce1bf5c3-525e-455d-a1e9-dd5f3d68c9d3',
'd0a6aa4c-821c-4bba-b53b-4f230ca3cda4',
'd9e817a2-6657-433b-8b2f-73790561725c',
'33c854eb-d228-4a82-8324-5e455ed1e447',
]
# Validate each snapshot and collect one [snapshot_id, validation_status] row per snapshot
results = []
for snapshot_id in snapshot_id_list:
    status = validate_snapshot_drs_format(snapshot_id)
    results.append([snapshot_id, status])
# Build the summary once, after the loop: it is not rebuilt on every iteration, and an
# empty snapshot_id_list yields an empty table rather than a results_df left over from an earlier cell
results_df = pd.DataFrame(results, columns = ["snapshot_id", "validation_status"])
display(results_df)

Unnamed: 0,snapshot_id,validation_status
0,c3d22305-b3f2-4561-a5b9-bed82ee742f4,Success
1,9fe2abd4-70b4-4eee-b00d-38726ced8620,Success
2,5329c25e-ccad-435d-9250-6fcc3ff88472,Success
3,ced601b2-9a11-40e9-8067-241e5a5996ed,Success
4,8165245c-2003-4ec7-bf57-731959022d47,Success
5,737d454c-88be-477f-ae2c-ef473e2106ce,Success
6,3bdbad9e-f9d4-4442-8606-791d490bf0af,Success
7,cd19195f-25a0-44b1-b47d-ec99141833fc,Success
8,b897e519-ba8b-4758-a263-6d57bd3b8e2b,Success
9,1d385cfc-4bed-4f52-8f7b-ea54fc44b4f7,Success


# Add and populate anvil_file.is_supplementary

## Script to patch dataset

In [None]:
def add_and_populate_supp_file_flg(dataset_id, profile_id="e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"):
    """Add anvil_file.is_supplementary to a TDR dataset if needed, then repopulate anvil_file.

    The function:
      1. Retrieves the dataset schema and BigQuery location from TDR.
      2. Adds a non-required boolean is_supplementary column to anvil_file when
         the schema does not already contain one.
      3. Re-selects every anvil_file row from BigQuery, setting is_supplementary
         to TRUE when no biosample can be linked to the file through the
         activity tables (directly, or through up to four intermediate files).
      4. Writes the rows as newline-delimited JSON, copies the file to the
         workspace bucket with gsutil, and ingests it with updateStrategy "replace".

    Args:
        dataset_id: ID of the TDR dataset to patch.
        profile_id: Value sent as "profile_id" in the ingest request. Defaults to
            the ID that was previously hard-coded here.

    Returns:
        "Success" when every step completes, otherwise "Failure" (details are logged).
    """
    # Function-scope import: subprocess is not part of the notebook's import cell
    import subprocess

    logging.info(f"Processing anvil_file.is_supplementary for Dataset ID = {dataset_id}")

    # Retrieve dataset information
    logging.info("Retrieving necessary information from TDR.")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        schema_tables = response["schema"]["tables"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
        return "Failure"

    # Determine if field needs to be added, and add if so
    logging.info("Adding anvil_file.is_supplementary to dataset schema, if necessary.")
    field_found = False
    for table in schema_tables:
        if table["name"] == "anvil_file":
            for col in table["columns"]:
                if col["name"] == "is_supplementary":
                    field_found = True
                    logging.info("Field already found! Skipping schema update.")
                    break
            # Only the first anvil_file table entry is inspected
            break
    if not field_found:
        logging.info("Field not found. Running dataset schema update.")
        schema_update_request = {
            "description": "Adding is_supplementary column to anvil_file",
            "changes": {
                "addColumns": [
                  {
                    "tableName": "anvil_file",
                    "columns": [
                      {
                        "name": "is_supplementary",
                        "datatype": "boolean",
                        "array_of": False,
                        "required": False
                      }
                    ]
                  }
                ]
            }
        }
        # One retry: the second consecutive failure aborts the function
        attempt_counter = 0
        while True:
            try:
                schema_update_result, job_id = utils.wait_for_tdr_job(datasets_api.update_schema(id=dataset_id, dataset_schema_update_model=schema_update_request))
                logging.info("Dataset schema update succeeded!")
                break
            except Exception as e:
                logging.error("Error on dataset schema update: {}".format(str(e)))
                attempt_counter += 1
                if attempt_counter < 2:
                    logging.info("Retrying dataset schema update (attempt #{})...".format(str(attempt_counter)))
                    sleep(15)
                    continue
                else:
                    logging.error("Maximum number of retries exceeded. Unable to update dataset schema. Exiting function.")
                    return "Failure"

    # Re-process anvil_file data to include is_supplementary (where appropriate) and ingest into TDR dataset (as replace)
    logging.info("Re-processing existing anvil_file data to include is_supplementary value.")
    client = bigquery.Client()
    target_file = "anvil_file.json"
    destination_dir = "ingest_pipeline/output/transformed/anvil/{}/table_data".format(dataset_id)
    # activity_exp: one row per (generated file, used file, used biosample) across all activity tables.
    # act_exp_lookup: file_id -> a linked biosample, following used-file -> generated-file links
    # for up to five levels. Files absent from the lookup are flagged supplementary.
    query = """BEGIN
        
        CREATE TEMPORARY TABLE activity_exp AS WITH activity_agg
        AS
        (
          SELECT used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_activity`
          UNION ALL 
          SELECT [] AS used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_alignmentactivity`
          UNION ALL 
          SELECT used_biosample_id, generated_file_id, [] AS used_file_id FROM `{project}.{dataset}.anvil_assayactivity`
          UNION ALL 
          SELECT used_biosample_id, generated_file_id, [] AS used_file_id FROM `{project}.{dataset}.anvil_sequencingactivity`
          UNION ALL 
          SELECT [] AS used_biosample_id, generated_file_id, used_file_id FROM `{project}.{dataset}.anvil_variantcallingactivity`
        )
        SELECT file_id, int_file_id, biosample_id
        FROM activity_agg
            LEFT JOIN UNNEST(used_biosample_id) AS biosample_id
            LEFT JOIN UNNEST(generated_file_id) as file_id
            LEFT JOIN UNNEST(used_file_id) as int_file_id
        ;
        
        CREATE TEMPORARY TABLE act_exp_lookup
        AS
        (
            SELECT file_id, MAX(biosample_id) AS biosample_id
          FROM
          (
            --Level 1:
            SELECT file_id, biosample_id
            FROM activity_exp
            WHERE int_file_id IS NULL AND file_id IS NOT NULl AND biosample_id IS NOT NULL
            --Level 2:
            UNION ALL
            SELECT a2.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
            WHERE a2.int_file_id IS NOT NULL AND a2.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
            --Level 3:
            UNION ALL
            SELECT a3.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
              LEFT JOIN activity_exp a3
              ON a2.file_id = a3.int_file_id
            WHERE a3.int_file_id IS NOT NULL AND a3.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
            --Level 4:
            UNION ALL
            SELECT a4.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
              LEFT JOIN activity_exp a3
              ON a2.file_id = a3.int_file_id
              LEFT JOIN activity_exp a4
              ON a3.file_id = a4.int_file_id
            WHERE a4.int_file_id IS NOT NULL AND a4.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
            --Level 5:
            UNION ALL
            SELECT a5.file_id, a1.biosample_id
            FROM activity_exp a1
              LEFT JOIN activity_exp a2
              ON a1.file_id = a2.int_file_id
              LEFT JOIN activity_exp a3
              ON a2.file_id = a3.int_file_id
              LEFT JOIN activity_exp a4
              ON a3.file_id = a4.int_file_id
              LEFT JOIN activity_exp a5
              ON a4.file_id = a5.int_file_id
            WHERE a5.int_file_id IS NOT NULL AND a5.file_id IS NOT NULL AND a1.biosample_id IS NOT NULL
          )
          GROUP BY file_id
        );
        
        SELECT t1.file_id, data_modality, file_format, file_size, file_md5sum, reference_assembly, file_name, file_ref, source_datarepo_row_ids,
        CASE WHEN t2.biosample_id IS NULL THEN TRUE ELSE FALSE END AS is_supplementary
        FROM `{project}.{dataset}.anvil_file` t1
          LEFT JOIN act_exp_lookup t2
          ON t1.file_id = t2.file_id
        ;
        
        END
        """.format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        records_list = json.loads(df.to_json(orient='records'))
        with open(target_file, 'w') as outfile:
            # Newline-delimited JSON: one record per line, no trailing newline
            outfile.write('\n'.join(json.dumps(record) for record in records_list))
        # check=True turns a non-zero gsutil exit code into CalledProcessError, so a
        # failed upload reaches the except below instead of being reported as success
        subprocess.run(
            ["gsutil", "cp", target_file, "{}/{}/".format(ws_bucket, destination_dir)],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        logging.info("Successfully created new anvil_file.json file.")
    except Exception as e:
        # CalledProcessError carries gsutil's stderr, which says more than the exit status
        detail = getattr(e, "stderr", None) or str(e)
        logging.error("Error creating new anvil_file.json file. Exiting function. Error: {}".format(detail))
        return "Failure"
    finally:
        # Remove the local scratch file on both the success and the failure path
        if os.path.exists(target_file):
            os.remove(target_file)

    # Ingest updated anvil_file data
    logging.info("Ingesting updated anvil_file data into TDR dataset.")
    source_full_file_path = "{}/{}/{}".format(ws_bucket, destination_dir, "anvil_file.json")
    ingest_request = {
        "table": "anvil_file",
        "profile_id": profile_id,
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "json",
        "load_tag": "Ingest for {}".format(dataset_id),
        "path": source_full_file_path
    }
    # One retry, with a fresh API client on every attempt
    attempt_counter = 0
    while True:
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest from file anvil_file.json succeeded: {}".format(str(ingest_request_result)[0:1000]))
            break
        except Exception as e:
            logging.error("Error on Dataset Ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                logging.info("Retrying Dataset Ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                continue
            else:
                logging.error("Maximum number of retries exceeded. Exiting function.")
                return "Failure"

    # Return success message if no failures recorded
    logging.info("Function completed successfully.")
    return "Success"

# Loop through datasets and process is_supplementary field
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
results = []
for dataset_id in dataset_id_list:
    status = add_and_populate_supp_file_flg(dataset_id)
    results.append([dataset_id, status])
# Build the summary frame once, after the loop, so it is not rebuilt per iteration
# and is still defined (as an empty frame) when dataset_id_list is empty
results_df = pd.DataFrame(results, columns=["dataset_id", "run_status"])
display(results_df)
    

## Script to validate patch worked properly

In [None]:
def validate_supp_file_flg(dataset_id):
    """Check that anvil_file.is_supplementary exists and is populated on every row.

    Args:
        dataset_id: ID of the TDR dataset to validate.

    Returns:
        "Success" when the column exists and COUNT(is_supplementary) equals
        COUNT(*) for anvil_file; otherwise a string starting with "Failure - "
        that names the step that failed.
    """
    # Retrieve dataset information
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        schema_tables = response["schema"]["tables"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR: {}".format(str(e)))
        return "Failure - Issue Retrieving Dataset Info"

    # Determine if field exists for dataset, continue if so, fail otherwise
    field_found = False
    for table in schema_tables:
        if table["name"] == "anvil_file":
            field_found = any(col["name"] == "is_supplementary" for col in table["columns"])
            break
    if not field_found:
        return "Failure - is_supplementary field not found"

    # COUNT(column) skips NULLs, so the two counts match only when every row is populated
    client = bigquery.Client()
    query = """SELECT COUNT(*) AS rec_cnt, COUNT(is_supplementary) AS populated_cnt
                    FROM `{project}.{dataset}.anvil_file`""".format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        rec_cnt = df["rec_cnt"].values[0]
        populated_cnt = df["populated_cnt"].values[0]
    except Exception as e:
        logging.error("Error querying BigQuery: {}".format(str(e)))
        return "Failure - BigQuery Error"
    if rec_cnt == populated_cnt:
        return "Success"
    # Previously this case fell through and returned None, hiding partial population
    logging.error("is_supplementary populated on {} of {} anvil_file records.".format(populated_cnt, rec_cnt))
    return "Failure - is_supplementary not fully populated"

# Loop through datasets and validate is_supplementary field
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
results = []
for dataset_id in dataset_id_list:
    status = validate_supp_file_flg(dataset_id)
    results.append([dataset_id, status])
# Build the summary frame once, after the loop, so it is not rebuilt per iteration
# and is still defined (as an empty frame) when dataset_id_list is empty
results_df = pd.DataFrame(results, columns=["dataset_id", "validation_status"])
display(results_df)

# Attempt to populate anvil_donor.organism_type

## Script to patch dataset

In [None]:
def populate_organism_type(dataset_id, profile_id="e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"):
    """Repopulate anvil_donor with an organism_type derived from the workspace reference.

    Every anvil_donor row is re-selected from BigQuery with organism_type set to
    'Homo sapiens' when any workspace_attributes row with attribute
    'library:reference' has a value matching one of the reference-name patterns
    in the query, and NULL otherwise. The rows are written as newline-delimited
    JSON, copied to the workspace bucket with gsutil, and ingested into the
    dataset with updateStrategy "replace".

    Args:
        dataset_id: ID of the TDR dataset to patch.
        profile_id: Value sent as "profile_id" in the ingest request. Defaults to
            the ID that was previously hard-coded here.

    Returns:
        "Success" when every step completes, otherwise "Failure" (details are logged).
    """
    # Function-scope import: subprocess is not part of the notebook's import cell
    import subprocess

    logging.info(f"Processing anvil_donor.organism_type for Dataset ID = {dataset_id}")

    # Retrieve dataset information
    logging.info("Retrieving necessary information from TDR.")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
        return "Failure"

    # Re-process anvil_donor data to include organism_type (where available)
    logging.info("Re-processing existing anvil_donor data to include organism_type value.")
    client = bigquery.Client()
    target_file = "anvil_donor.json"
    destination_dir = "ingest_pipeline/output/transformed/anvil/{}/table_data".format(dataset_id)
    # The scalar subquery yields one dataset-wide value, applied to every donor row
    query = """SELECT donor_id, 
    (SELECT MAX(CASE WHEN REGEXP_CONTAINS(value, '(h37|h38|h39|hg16|hg17|hg18|hg19|hs37|hs38|b37)') THEN 'Homo sapiens' END) AS organism_type FROM `{project}.{dataset}.workspace_attributes` WHERE attribute = 'library:reference') AS organism_type,
    part_of_dataset_id, phenotypic_sex, reported_ethnicity, genetic_ancestry, source_datarepo_row_ids
    FROM `{project}.{dataset}.anvil_donor`""".format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        records_list = json.loads(df.to_json(orient='records'))
        with open(target_file, 'w') as outfile:
            # Newline-delimited JSON: one record per line, no trailing newline
            outfile.write('\n'.join(json.dumps(record) for record in records_list))
        # check=True turns a non-zero gsutil exit code into CalledProcessError, so a
        # failed upload reaches the except below instead of being reported as success
        subprocess.run(
            ["gsutil", "cp", target_file, "{}/{}/".format(ws_bucket, destination_dir)],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        logging.info("Successfully created new anvil_donor.json file.")
    except Exception as e:
        # CalledProcessError carries gsutil's stderr, which says more than the exit status
        detail = getattr(e, "stderr", None) or str(e)
        logging.error("Error creating new anvil_donor.json file. Exiting function. Error: {}".format(detail))
        return "Failure"
    finally:
        # Remove the local scratch file on both the success and the failure path
        if os.path.exists(target_file):
            os.remove(target_file)

    # Ingest updated anvil_donor data
    logging.info("Ingesting updated anvil_donor data into TDR dataset.")
    source_full_file_path = "{}/{}/{}".format(ws_bucket, destination_dir, "anvil_donor.json")
    ingest_request = {
        "table": "anvil_donor",
        "profile_id": profile_id,
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "json",
        "load_tag": "Ingest for {}".format(dataset_id),
        "path": source_full_file_path
    }
    # One retry, with a fresh API client on every attempt
    attempt_counter = 0
    while True:
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest from file anvil_donor.json succeeded: {}".format(str(ingest_request_result)[0:1000]))
            break
        except Exception as e:
            logging.error("Error on Dataset Ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                logging.info("Retrying Dataset Ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                continue
            else:
                logging.error("Maximum number of retries exceeded. Exiting function.")
                return "Failure"

    # Return success message if no failures recorded
    logging.info("Function completed successfully.")
    return "Success"

# Loop through datasets and populate anvil_donor.organism_type
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
results = []
for dataset_id in dataset_id_list:
    status = populate_organism_type(dataset_id)
    results.append([dataset_id, status])
# Build the summary frame once, after the loop, so it is not rebuilt per iteration
# and is still defined (as an empty frame) when dataset_id_list is empty
results_df = pd.DataFrame(results, columns=["dataset_id", "run_status"])
display(results_df)


## Script to examine organism_type population

In [None]:
def validate_organism_type(dataset_id):
    """Report whether anvil_donor.organism_type is populated on at least one row.

    An unpopulated field is not treated as a failure: the patch only fills
    organism_type when a matching workspace reference exists.

    Args:
        dataset_id: ID of the TDR dataset to examine.

    Returns:
        "Success - Field Populated" when COUNT(organism_type) > 0,
        "Success - Field Not Populated" when it is 0, or a string starting with
        "Failure - " when the TDR lookup or the BigQuery query fails.
    """
    # Retrieve dataset information (only the BigQuery location is needed)
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR: {}".format(str(e)))
        return "Failure - Issue Retrieving Dataset Info"

    # Count non-NULL organism_type values in anvil_donor
    client = bigquery.Client()
    query = """SELECT COUNT(organism_type) AS populated_cnt
                FROM `{project}.{dataset}.anvil_donor`""".format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(query).result().to_dataframe()
        if df["populated_cnt"].values[0] > 0:
            return "Success - Field Populated"
        return "Success - Field Not Populated"
    except Exception as e:
        logging.error("Error querying BigQuery: {}".format(str(e)))
        return "Failure - BigQuery Error"

# Loop through datasets and check anvil_donor.organism_type population
dataset_id_list = [
'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
]
results = []
for dataset_id in dataset_id_list:
    status = validate_organism_type(dataset_id)
    results.append([dataset_id, status])
# Build the summary frame once, after the loop, so it is not rebuilt per iteration
# and is still defined (as an empty frame) when dataset_id_list is empty
results_df = pd.DataFrame(results, columns=["dataset_id", "validation_status"])
display(results_df)

# Update references to md5-added files

In [2]:
# Function to collect all datarepo rows for a particular table within a dataset
def collect_all_datarepo_rows(dataset_id, table_name):
    """Return the datarepo_row_id of every row in one table of a TDR dataset.

    Args:
        dataset_id: ID of the TDR dataset that owns the table.
        table_name: Name of the table, as it appears in the dataset's BigQuery schema.

    Returns:
        A list of datarepo_row_id values (empty when the table has no rows).

    Raises:
        Exception: Re-raised after logging when the dataset lookup or the
            BigQuery query fails.
    """
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving BQ project and schema: {}".format(str(e)))
        # Without the project/schema the query cannot be built; stop here rather
        # than continuing into an UnboundLocalError on bq_project
        raise
    client = bigquery.Client()
    query = "SELECT datarepo_row_id FROM `{project}.{schema}.{table}`".format(project = bq_project, schema = bq_schema, table = table_name)
    try:
        query_job = client.query(query)
        # Iterating the job waits for completion and yields the result rows
        return [row["datarepo_row_id"] for row in query_job]
    except Exception as e:
        logging.error("Error retrieving datarepo_row_id list: {}".format(str(e)))
        # Bare raise keeps the original exception type and traceback
        raise

# Function to delete rows from a dataset
def delete_old_records(dataset_id, table, datarepo_row_ids):
    """Soft-delete the given rows from one table of a TDR dataset.

    Args:
        dataset_id: ID of the TDR dataset that owns the table.
        table: Name of the table to delete from.
        datarepo_row_ids: List of datarepo_row_id values to delete. When empty
            (or None) nothing is submitted.

    Raises:
        Exception: Re-raised after logging when the deletion job fails.
    """
    logging.info(f"Attempting to delete original {table} records.")
    if not datarepo_row_ids:
        logging.info("No datarepo_row_ids specified for deletion.")
        return

    data_deletion_payload = {
        "deleteType": "soft",
        "specType": "jsonArray",
        "tables": [{
          "tableName": table,
          "jsonArraySpec": {
            "rowIds": datarepo_row_ids
          }
        }]
    }
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        data_deletion_result, job_id = utils.wait_for_tdr_job(datasets_api.apply_dataset_data_deletion(id=dataset_id, data_deletion_request=data_deletion_payload))
        logging.info("Result: {}".format(data_deletion_result))
    except Exception as e:
        # Logged at ERROR (was INFO) so failed deletions stand out in the output
        logging.error("Error: {}".format(str(e)))
        # Bare raise keeps the original exception type and traceback
        raise

def ingest_updated_records(profile_id, dataset_id, table, records_dict, max_retries=0):
    """Ingest a batch of in-memory records into a TDR dataset table.

    The records are submitted inline ("format": "array") with updateStrategy
    "replace".

    Args:
        profile_id: Value sent as "profile_id" in the ingest request.
        dataset_id: ID of the TDR dataset to ingest into.
        table: Name of the target table.
        records_dict: The records to ingest, sent as the request's "records"
            value (callers in this notebook pass a list of dicts).
        max_retries: Number of additional attempts after the first failure. The
            default of 0 keeps the previous behaviour of a single attempt.

    Returns:
        None on success.

    Raises:
        Exception: The error from the final failed attempt, after logging.
    """
    logging.info(f"Submitting ingest for updated {table} records.")

    # Build, submit, and monitor ingest request
    ingest_request = {
        "table": table,
        "profile_id": profile_id,
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "array",
        "bulkMode": False,
        "load_tag": f"File ref fields patch for {table} in {dataset_id}",
        "records": records_dict
    }
    attempt_counter = 0
    while True:
        try:
            # Fresh client on every attempt
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            logging.info("Ingest succeeded: {}".format(str(ingest_request_result)[0:1000]))
            return
        except Exception as e:
            logging.error("Error on ingest: {}".format(str(e)))
            attempt_counter += 1
            # The old test (`attempt_counter < 1` after the increment) could never
            # be true, so the retry branch was dead code; it is now reachable
            # through max_retries while the default still performs no retry
            if attempt_counter <= max_retries:
                logging.info("Retrying ingest (attempt #{})...".format(str(attempt_counter)))
                sleep(10)
                continue
            logging.error("Maximum number of retries exceeded. Logging error.")
            # Bare raise keeps the original exception type and traceback
            raise
                
def update_recs_w_file_refs(dataset_id, profile_id="e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"):
    """Rewrite file references for files lacking an md5 in file_inventory, then rebuild anvil_% tables.

    The function:
      1. Retrieves the dataset schema and BigQuery location from TDR.
      2. For every table other than file_inventory and anvil_file that has
         fileref columns, selects the rows referencing a file whose
         file_inventory.md5_hash is NULL, replaces each fileref with a
         {sourcePath, targetPath} object taken from succeeded
         datarepo_load_history rows, ingests the rewritten rows, and
         soft-deletes the originals.
      3. Does the same for file_inventory itself, also filling md5_hash from the
         GCS object metadata.
      4. Soft-deletes every row in the anvil_% tables.
      5. Re-runs the T pipeline via utils.run_t_pipeline with data validation
         skipped.

    Args:
        dataset_id: ID of the TDR dataset to patch.
        profile_id: Profile ID used for ingests and passed to the T pipeline.
            Defaults to the ID that was previously hard-coded here.

    Returns:
        "Success", or a string starting with "Failure - " that names the stage
        that failed (details are logged).
    """
    logging.info(f"Processing md5-added files for Dataset ID = {dataset_id}")

    ## Retrieve dataset information
    logging.info("Retrieving necessary information from TDR.")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        schema_tables = response["schema"]["tables"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
        return "Failure - Pre-processing"

    ## Parse TDR schema to identify file reference fields
    # Maps table name -> names of its fileref columns.
    # NOTE(review): array-valued fileref columns get the same scalar IN / join
    # treatment below as single-valued ones; confirm no affected table has one.
    table_dict = {}
    for table in schema_tables:
        if table["name"] in ["file_inventory", "anvil_file"]:
            continue
        fileref_cols = [column["name"] for column in table["columns"] if column["datatype"] == "fileref"]
        if fileref_cols:
            table_dict[table["name"]] = fileref_cols

    ## Loop through tables and re-process impacted records
    for table, col_list in table_dict.items():
        logging.info(f"Processing updates for {table}.")
        # Retrieve relevant records from BigQuery
        # Each fileref column gets its own load_hist join alias (t0, t1, ...)
        old_cols = ", ".join(col_list)
        where_clause = " OR ".join(f"t.{column_name} IN (SELECT file_ref FROM file_list)" for column_name in col_list)
        new_cols = "".join(
            f", CASE WHEN t{idx}.source_name IS NOT NULL THEN TO_JSON(STRUCT(t{idx}.source_name AS sourcePath, t{idx}.target_path AS targetPath)) END AS {column_name}"
            for idx, column_name in enumerate(col_list)
        )
        join_clause = "".join(
            f" LEFT JOIN load_hist t{idx} ON t.{column_name} = t{idx}.file_id"
            for idx, column_name in enumerate(col_list)
        )

        query = """WITH 
            file_list AS (SELECT * FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
            load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
            SELECT t.* EXCEPT({old_cols}){new_cols}
            FROM `{project}.{dataset}.{table}` t {joins} WHERE {where}""".format(project=bq_project, dataset=bq_dataset, table=table, old_cols=old_cols, new_cols=new_cols, joins=join_clause, where=where_clause)
        try:
            client = bigquery.Client()
            res = client.query(query).result()
            if res.total_rows > 0:
                logging.info(f"{res.total_rows} records to process.")
                df = res.to_dataframe()
                records_json = df.to_json(orient='records')
                records_list = json.loads(records_json)
            else:
                logging.info("No records to process.")
                records_list = []
        except Exception as e:
            logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
            return "Failure - Table Processing"
        # Ingest updated records back to TDR dataset
        try:
            datarepo_row_ids = []
            for record in records_list:
                datarepo_row_ids.append(record.pop("datarepo_row_id", None))
                for col in col_list:
                    # The CASE above leaves the column NULL when no succeeded
                    # load-history row matches; json.loads(None) then raises
                    # TypeError and the except below fails the table instead of
                    # ingesting a record with a dropped reference
                    record[col] = json.loads(record[col])
            if records_list:
                # Ingest the replacements first, then remove the originals
                ingest_updated_records(profile_id, dataset_id, table, records_list)
                delete_old_records(dataset_id, table, datarepo_row_ids)
        except Exception as e:
            logging.error(f"Error replacing TDR records: {str(e)}")
            return "Failure - Table Processing"

    ## Re-process file_inventory
    logging.info("Processing updates for file_inventory.")
    # Retrieve relevant records from BigQuery
    query = """WITH 
        file_list AS (SELECT file_ref FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
        load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
        SELECT t1.*, CASE WHEN t2.source_name IS NOT NULL THEN TO_JSON(STRUCT(t2.source_name AS sourcePath, t2.target_path AS targetPath)) END AS file_ref
        FROM `{project}.{dataset}.file_inventory` t1
          INNER JOIN load_hist t2 ON t1.file_ref = t2.file_id
        WHERE file_ref IN (SELECT file_ref FROM file_list)""".format(project=bq_project, dataset=bq_dataset)
    try:
        client = bigquery.Client()
        res = client.query(query).result()
        if res.total_rows > 0:
            logging.info(f"{res.total_rows} records to process.")
            df = res.to_dataframe()
            records_json = df.to_json(orient='records')
            records_list = json.loads(records_json)
        else:
            logging.info("No records to process.")
            records_list = []
    except Exception as e:
        logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
        return "Failure - File Inventory Processing"
    # Loop through records and update md5_hash from GCS metadata
    try:
        storage_client = storage.Client()
        datarepo_row_ids = []
        for record in records_list:
            # Split gs://<bucket>/<object>; the object part is taken whole so names
            # containing spaces or other punctuation are not silently truncated
            uri_match = re.fullmatch(r'gs://([^/]+)/(.+)', record["uri"])
            if uri_match is None:
                raise ValueError(f"Unable to parse GCS URI: {record['uri']}")
            bucket_name, object_name = uri_match.groups()
            bucket = storage_client.bucket(bucket_name, user_project="anvil-datastorage")
            blob = bucket.get_blob(object_name)
            if blob is None:
                # get_blob returns None for a missing object; fail with a clear message
                raise ValueError(f"Object not found in GCS: {record['uri']}")
            record["md5_hash"] = blob.md5_hash
            datarepo_row_ids.append(record.pop("datarepo_row_id", None))
    except Exception as e:
        logging.error(f"Error retrieving file metadata from GCS: {str(e)}")
        return "Failure - File Inventory Processing"
    # Ingest updated records back to TDR dataset
    try:
        if records_list:
            ingest_updated_records(profile_id, dataset_id, "file_inventory", records_list)
            delete_old_records(dataset_id, "file_inventory", datarepo_row_ids)
    except Exception as e:
        logging.error(f"Error replacing TDR records: {str(e)}")
        return "Failure - File Inventory Processing"

    ## Empty anvil_% tables
    logging.info("Clearing out existing anvil_% tables")
    table_list = ["anvil_activity", "anvil_alignmentactivity", "anvil_antibody", "anvil_assayactivity", "anvil_biosample", "anvil_diagnosis", "anvil_donor", "anvil_file", "anvil_sequencingactivity", "anvil_variantcallingactivity"]
    for table in table_list:
        try:
            datarepo_row_ids = collect_all_datarepo_rows(dataset_id, table)
            if datarepo_row_ids:
                delete_old_records(dataset_id, table, datarepo_row_ids)
        except Exception as e:
            logging.error(f"Error clearing out existing anvil_% records: {str(e)}")
            return "Failure - anvil_% Record Deletion"

    ## Re-run T pipeline without validation
    params = {}
    params["ws_name"] = ws_name
    params["ws_project"] = ws_project
    params["ws_bucket"] = ws_bucket
    params["ws_bucket_name"] = ws_bucket_name
    params["profile_id"] = profile_id
    params["mapping_target"] = "anvil"
    params["skip_transforms"] = False
    params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names 
    params["skip_schema_extension"] = False
    params["skip_ingests"] = False
    params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
    params["skip_file_relation_inference"] = False
    params["skip_dangling_fk_resolution"] = False
    params["skip_supplementary_file_identification"] = False
    params["skip_snapshot_creation"] = False
    params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
    params["skip_data_validation"] = True
    try:
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        dataset_name = dataset_info["name"]
        phs_id = dataset_info["phs_id"]
        consent_name = dataset_info["properties"]["consent_name"]
        auth_domains = dataset_info["properties"]["auth_domains"]
        # Not used below; the lookup is kept so a dataset without this property
        # still fails here. NOTE(review): confirm whether it is needed.
        src_workspaces = dataset_info["properties"]["source_workspaces"]
    except Exception as e:
        # Was a bare `except:`, which also trapped KeyboardInterrupt and hid the cause
        logging.error(f"Error retrieving dataset details for T pipeline: {str(e)}")
        return "Failure - Dataset Retrieval for T Pipeline"
    if dataset_name:
        params["dataset_id"] = dataset_id
        params["dataset_name"] = dataset_name
        params["phs_id"] = phs_id
        params["consent_name"] = consent_name
        params["auth_domains"] = auth_domains
        utils.run_t_pipeline(params)
    else:
        logging.warning("Dataset name is empty. Skipping T pipeline run.")

    # Return success message if no failures recorded
    logging.info("Function completed successfully.")
    return "Success"

# Loop through datasets and process md5 updates
dataset_id_list = [
'700303c2-fcef-48a5-9900-096bf34e2d83',
'a715c70d-da92-43ee-a851-1a27277909a2',
]
results = []
for dataset_id in dataset_id_list:
    status = update_recs_w_file_refs(dataset_id)
    results.append([dataset_id, status])
# Build the summary frame once, after the loop, so it is not rebuilt per iteration
# and is still defined (as an empty frame) when dataset_id_list is empty
results_df = pd.DataFrame(results, columns=["dataset_id", "run_status"])
display(results_df)


03/31/2023 01:41:51 PM - INFO: Processing md5-added files for Dataset ID = 700303c2-fcef-48a5-9900-096bf34e2d83
03/31/2023 01:41:51 PM - INFO: Retrieving necessary information from TDR.
03/31/2023 01:41:51 PM - INFO: Processing updates for sample.
03/31/2023 01:41:54 PM - INFO: No records to process.
03/31/2023 01:41:54 PM - INFO: Processing updates for file_inventory.
03/31/2023 01:41:56 PM - INFO: 1 records to process.
03/31/2023 01:41:57 PM - INFO: Submitting ingest for updated file_inventory records.
TDR Job ID: Qf0n906NT1OCnyEMLf0mtg
03/31/2023 01:42:28 PM - INFO: Ingest succeeded: {'dataset_id': '700303c2-fcef-48a5-9900-096bf34e2d83', 'dataset': 'ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107', 'table': 'file_inventory', 'path': None, 'load_tag': 'File ref fields patch for file_inventory in 700303c2-fcef-48a5-9900-096bf34e2d83', 'row_count': 1, 'bad_row_count': 0, 'load_result': None}
03/31/2023 01:42:28 PM - INFO: Attempting to delete original file_inventory records.
TDR Job ID: ykD

TDR Job ID: RgWm3B9lTZahjbyVkSG44w
03/31/2023 01:47:37 PM - INFO: File relationships inference ingest succeeded: {'dataset_id': '700303c2-fcef-48a5-9900-096bf34e2d83', 'dataset': 'ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107', 'table': 'anvil_activity', 'path': None, 'load_tag': 'File relationships inference ingest for 700303c2-fcef-48a5-9900-096bf34e2d83', 'row_count': 2993, 'bad_row_count': 0, 'load_result': None}
03/31/2023 01:47:37 PM - INFO: Running dangling foreign key resolution.
03/31/2023 01:47:37 PM - INFO: Attempting to identify the TDR object, and collect and parse its schema...
03/31/2023 01:47:38 PM - INFO: Attempting to identify and remediate dangling foreign keys...
03/31/2023 01:47:38 PM - INFO: Identifying dangling foreign keys for anvil_donor...
03/31/2023 01:47:39 PM - INFO: Identifying dangling foreign keys for anvil_antibody...
03/31/2023 01:47:41 PM - INFO: Identifying dangling foreign keys for anvil_biosample...
03/31/2023 01:47:43 PM - INFO: Identifying dangling 

Unnamed: 0,Dataset,Time,Step,Task,Status,Message
0,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:43:34,Transform Artifact Retrieval,Confirm Transform Artifact Retrieval,Success,
1,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:43:45,Transformed Files Creation,File: anvil_activity.json,Success,
2,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:43:53,Transformed Files Creation,File: anvil_biosample.json,Success,
3,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:43:58,Transformed Files Creation,File: anvil_dataset.json,Success,
4,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:05,Transformed Files Creation,File: anvil_donor.json,Success,
5,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:12,Transformed Files Creation,File: anvil_file.json,Success,
6,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:18,Transformed Files Creation,File: anvil_project.json,Success,
7,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:18,TDR Schema Extension,Extend TDR Schema,Success,No new tables or relationships to add to the TDR schema.
8,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:39,Dataset Ingests,Table: anvil_activity - File: anvil_activity.json,Success,"Job_ID: iflXmnzUQsyo3igv9uipjQ - Truncated Response: {'dataset_id': '700303c2-fcef-48a5-9900-096bf34e2d83', 'dataset': 'ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107', 'table': 'anvil_activity', 'path': 'gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/transformed/anvil/700303c2-fcef-48a5-9900-096bf34e2d83/table_data/anvil_activity.json', 'load_tag': 'Ingest for 700303c2-fcef-48a5-9900-096bf34e2d83', 'row_count': 1496, 'bad_row_count': 0, 'load_result': None}"
9,ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107 (700303c2-fcef-48a5-9900-096bf34e2d83),2023-03-31 13:44:59,Dataset Ingests,Table: anvil_biosample - File: anvil_biosample.json,Success,"Job_ID: 4lJLTcMnSp-vQbj1duD0VQ - Truncated Response: {'dataset_id': '700303c2-fcef-48a5-9900-096bf34e2d83', 'dataset': 'ANVIL_CCDG_Broad_MI_BRAVE_GRU_WES_20221107', 'table': 'anvil_biosample', 'path': 'gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/output/transformed/anvil/700303c2-fcef-48a5-9900-096bf34e2d83/table_data/anvil_biosample.json', 'load_tag': 'Ingest for 700303c2-fcef-48a5-9900-096bf34e2d83', 'row_count': 1496, 'bad_row_count': 0, 'load_result': None}"


03/31/2023 01:50:22 PM - INFO: Function completed successfully.
03/31/2023 01:50:22 PM - INFO: Processing md5-added files for Dataset ID = a715c70d-da92-43ee-a851-1a27277909a2
03/31/2023 01:50:22 PM - INFO: Retrieving necessary information from TDR.
03/31/2023 01:50:22 PM - INFO: Processing updates for pggb.
03/31/2023 01:50:25 PM - INFO: 8 records to process.
03/31/2023 01:50:26 PM - INFO: Submitting ingest for updated pggb records.
TDR Job ID: 1fd9nwmgSyu1ChY0zxaJ1Q
03/31/2023 01:51:09 PM - INFO: Ingest succeeded: {'dataset_id': 'a715c70d-da92-43ee-a851-1a27277909a2', 'dataset': 'ANVIL_HPRC_20230310', 'table': 'pggb', 'path': None, 'load_tag': 'File ref fields patch for pggb in a715c70d-da92-43ee-a851-1a27277909a2', 'row_count': 8, 'bad_row_count': 0, 'load_result': {'loadSummary': {'loadTag': 'File ref fields patch for pggb in a715c70d-da92-43ee-a851-1a27277909a2', 'jobId': '1fd9nwmgSyu1ChY0zxaJ1Q', 'totalFiles': 0, 'succeededFiles': 0, 'failedFiles': 0, 'notTriedFiles': 0}, 'loadFi

Unnamed: 0,dataset_id,run_status
0,700303c2-fcef-48a5-9900-096bf34e2d83,Success
1,a715c70d-da92-43ee-a851-1a27277909a2,Failure - Table Processing


In [None]:
# # Testing
# dataset_id = 'bc6075ac-5cfe-4613-8601-36ceb614939e'

# logging.info(f"Processing md5-added files for Dataset ID = {dataset_id}")

# ## Retrieve dataset information
# logging.info("Retrieving necessary information from TDR.")
# src_schema_dict = {}
# api_client = utils.refresh_tdr_api_client()
# datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
# try:
#     response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
#     src_schema_dict["tables"] = response["schema"]["tables"]
#     bq_project = response["access_information"]["big_query"]["project_id"]
#     bq_dataset = response["access_information"]["big_query"]["dataset_name"]
# except Exception as e:
#     logging.error("Error retrieving information from TDR. Exiting function. Error: {}".format(e))
#     #return "Failure - Pre-processing"

# ## Parse TDR schema to identify file reference fields
# table_dict = {}
# for table in src_schema_dict["tables"]:
#     if table["name"] in ["file_inventory", "anvil_file"]:
#         continue
#     else:
#         col_list = []
#         for column in table["columns"]:
#             if column["datatype"] == "fileref":
#                 col_list.append([column["name"], column["array_of"]])
#         if col_list:
#             table_dict[table["name"]] = col_list

# ## Loop through tables and re-process impacted records
# for table in table_dict.keys():
#     logging.info(f"Processing updates for {table}.")
#     # Retrieve relevant records from BigQuery
#     col_list = []
#     old_cols = ""
#     new_cols = ""
#     join_clause = ""
#     where_clause = ""
#     for idx, col in enumerate(table_dict[table]):
#         column_name = col[0]
#         col_list.append(column_name)
#         if idx == 0: 
#             old_cols += column_name
#             where_clause += f"t.{column_name} IN (SELECT file_ref FROM file_list)"
#         else:
#             old_cols += ", " + column_name
#             where_clause += f" OR t.{column_name} IN (SELECT file_ref FROM file_list)"
#         new_cols += f", CASE WHEN t{idx}.source_name IS NOT NULL THEN TO_JSON(STRUCT(t{idx}.source_name AS sourcePath, t{idx}.target_path AS targetPath)) END AS {column_name}"
#         join_clause += f" LEFT JOIN load_hist t{idx} ON t.{column_name} = t{idx}.file_id"

#     query = """WITH 
#         file_list AS (SELECT * FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
#         load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
#         SELECT t.* EXCEPT({old_cols}){new_cols}
#         FROM `{project}.{dataset}.{table}` t {joins} WHERE {where}""".format(project=bq_project, dataset=bq_dataset, table=table, old_cols=old_cols, new_cols=new_cols, joins=join_clause, where=where_clause)
#     try:
#         client = bigquery.Client()
#         res = client.query(query).result()
#         if res.total_rows > 0:
#             logging.info(f"{res.total_rows} records to process.")
#             df = res.to_dataframe()
#             records_json = df.to_json(orient='records')
#             records_list = json.loads(records_json)
#         else:
#             logging.info("No records to process.")
#             records_list = []
#     except Exception as e:
#         logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
#         break
#         #return "Failure - Table Processing"
#     # Ingest updated records back to TDR dataset
#     try:
#         datarepo_row_ids = []
#         for record in records_list:
#             datarepo_row_ids.append(record.pop("datarepo_row_id", None))
#             for col in col_list:
#                 record[col] = json.loads(record[col])
#         if records_list:
#             ingest_updated_records("e0e03e48-5b96-45ec-baa4-8cc1ebf74c61", dataset_id, table, records_list)
#             delete_old_records(dataset_id, table, datarepo_row_ids)
#     except Exception as e:
#         logging.error(f"Error replacing TDR records: {str(e)}")
#         break
#         #return "Failure - Table Processing"

# # ## Re-process file_inventory
# # logging.info(f"Processing updates for file_inventory.")
# # # Retrieve relevant records from BigQuery
# # query = """WITH 
# #     file_list AS (SELECT file_ref FROM `{project}.{dataset}.file_inventory` WHERE md5_hash IS NULL),
# #     load_hist AS (SELECT * FROM `{project}.{dataset}.datarepo_load_history` WHERE state = 'succeeded')
# #     SELECT t1.*, CASE WHEN t2.source_name IS NOT NULL THEN TO_JSON(STRUCT(t2.source_name AS sourcePath, t2.target_path AS targetPath)) END AS file_ref
# #     FROM `{project}.{dataset}.file_inventory` t1
# #       INNER JOIN load_hist t2 ON t1.file_ref = t2.file_id
# #     WHERE file_ref IN (SELECT file_ref FROM file_list)""".format(project=bq_project, dataset=bq_dataset)
# # try:
# #     client = bigquery.Client()
# #     res = client.query(query).result()
# #     if res.total_rows > 0:
# #         logging.info(f"{res.total_rows} records to process.")
# #         df = res.to_dataframe()
# #         records_json = df.to_json(orient='records')
# #         records_list = json.loads(records_json)
# #     else:
# #         logging.info("No records to process.")
# #         records_list = []
# # except Exception as e:
# #     logging.error(f"Error retrieving update records from BigQuery: {str(e)}")
# #     #return "Failure - File Inventory Processing"
# # # Loop through records and update md5_hash from GCS metadata
# # try:
# #     storage_client = storage.Client()
# #     datarepo_row_ids = []
# #     for record in records_list:
# #         bucket = re.match('gs:\/\/([a-z0-9\-]+)', record["uri"]).group(1)
# #         obj = re.match('gs:\/\/[a-z0-9\-]+\/([A-Za-z0-9\-_\/\.]+)', record["uri"]).group(1)
# #         bucket = storage_client.bucket(bucket, user_project="anvil-datastorage")
# #         blob = bucket.get_blob(obj)
# #         record["md5_hash"] = blob.md5_hash
# #         datarepo_row_ids.append(record.pop("datarepo_row_id", None))
# # except Exception as e:
# #     logging.error(f"Error retrieving file metadata from GCS: {str(e)}")
# #     #return "Failure - File Inventory Processing"
# # # Ingest updated records back to TDR dataset
# # try:
# #     if records_list:
# #         ingest_updated_records("e0e03e48-5b96-45ec-baa4-8cc1ebf74c61", dataset_id, "file_inventory", records_list)
# #         delete_old_records(dataset_id, "file_inventory", datarepo_row_ids)         
# # except Exception as e:
# #     logging.error(f"Error replacing TDR records: {str(e)}")
# #     #return "Failure - File Inventory Processing"

# # ## Empty anvil_% tables
# # logging.info("Clearing out existing anvil_% tables")
# # table_list = ["anvil_activity", "anvil_alignmentactivity", "anvil_antibody", "anvil_assayactivity", "anvil_biosample", "anvil_diagnosis", "anvil_donor", "anvil_file", "anvil_sequencingactivity", "anvil_variantcallingactivity"]
# # for table in table_list:
# #     try:
# #         datarepo_row_ids = collect_all_datarepo_rows(dataset_id, table)
# #         if datarepo_row_ids:
# #             delete_old_records(dataset_id, table, datarepo_row_ids)
# #     except Exception as e:
# #         logging.error(f"Error clearing out existing anvil_% records: {str(e)}")
# #         break
# #         #return "Failure - anvil_% Record Deletion"

# # ## Re-run T pipeline without validation
# # params = {}
# # params["ws_name"] = ws_name
# # params["ws_project"] = ws_project
# # params["ws_bucket"] = ws_bucket
# # params["ws_bucket_name"] = ws_bucket_name
# # params["profile_id"] = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61" 
# # params["mapping_target"] = "anvil"
# # params["skip_transforms"] = False
# # params["transform_list_override"] = [] # Leave empty to run transforms for all files, otherwise populate with target table names 
# # params["skip_schema_extension"] = False
# # params["skip_ingests"] = False
# # params["ingest_list_override"] = [] # Leave empty to run ingests for all files, otherwise populate with target table names
# # params["skip_file_relation_inference"] = False
# # params["skip_dangling_fk_resolution"] = False
# # params["skip_supplementary_file_identification"] = False
# # params["skip_snapshot_creation"] = False
# # params["snapshot_readers_list"] = ["azul-anvil-prod@firecloud.org"] # Include "auth-domain" to add the auth domain(s) as a reader (if one exists)
# # params["skip_data_validation"] = True
# # try:
# #     api_client = utils.refresh_tdr_api_client()
# #     datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
# #     dataset_info = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
# #     dataset_name = dataset_info["name"]
# #     phs_id = dataset_info["phs_id"]
# #     consent_name = dataset_info["properties"]["consent_name"]
# #     auth_domains = dataset_info["properties"]["auth_domains"]
# #     src_workspaces = dataset_info["properties"]["source_workspaces"]
# # except:
# #     dataset_name = ""
# #     return "Failure - Dataset Retrieval for T Pipeline"
# # if dataset_name:
# #     params["dataset_id"] = dataset_id
# #     params["dataset_name"] = dataset_name
# #     params["phs_id"] = phs_id
# #     params["consent_name"] = consent_name
# #     params["auth_domains"] = auth_domains
# #     utils.run_t_pipeline(params)

# # Return success message if no failures recorded
# logging.info("Function completed successfully.")
# #return "Success"


In [None]:
# for idx, record in enumerate(records_list):
#     if record["library_2_estimated_library_size"]:
#         print(str(idx) + " - " + str(record["library_2_estimated_library_size"]))

In [None]:
# records_list[50]

# Add new supplementary workspace files to TDR dataset

## Script to identify new supplementary files and ingest them to TDR dataset

In [None]:
def ingest_supplementary_files(dataset_id, dry_run=True):
    """Find workspace files missing from a TDR dataset's file_inventory table and optionally ingest them.

    The dataset's source workspaces are resolved to their buckets, a fresh file
    inventory is built from those buckets, and every inventory record whose
    "uri" is not already present in the dataset's BigQuery file_inventory table
    is treated as new.

    Args:
        dataset_id: ID of the TDR dataset to inspect.
        dry_run: When True (the default, which preserves this function's
            previous behavior) the new records are returned without being
            ingested. When False, the new records are ingested into the
            dataset's file_inventory table.

    Returns:
        A "Failure - ..." status string if any handled step fails. Otherwise the
        list of new file inventory records (dicts) when dry_run is True, or
        "Success" when dry_run is False and the ingest completed (or there was
        nothing to ingest).
    """
    # Retrieve dataset details
    logging.info("Retrieving dataset details.")
    try:
        # The API calls live inside the try so a TDR error is reported with the
        # same status-string convention as every other step.
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        source_workspaces = dataset_details["properties"]["source_workspaces"]
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error(f"Error retrieving dataset details: {str(e)}")
        return "Failure - Issue Retrieving Dataset Info"

    # Use source workspace(s) to find workspace bucket(s) to look for new files
    logging.info("Determining source workspace bucket(s).")
    data_files_src_buckets = {}
    for ws in source_workspaces:
        try:
            ws_attributes = utils.get_workspace_attributes("anvil-datastorage", ws)
            src_bucket = ws_attributes.get("bucketName") or ""
        except Exception as e:
            logging.error(f"Error retrieving attributes for workspace {ws}: {str(e)}")
            return "Failure - Issue Retrieving Source Buckets"
        if not src_bucket:
            logging.error(f"No bucket name found for workspace {ws}.")
            return "Failure - Issue Retrieving Source Buckets"
        # Several workspaces may share a bucket; register each bucket once.
        if src_bucket not in data_files_src_buckets:
            data_files_src_buckets[src_bucket] = {
                "include_dirs": [],
                "exclude_dirs": []
            }

    # Pull existing file inventory from BigQuery
    logging.info("Pulling existing file inventory records.")
    client = bigquery.Client()
    query = """SELECT uri FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
    try:
        # A set keeps the membership test in the diff below O(1) per file.
        existing_uris = {row.uri for row in client.query(query).result()}
    except Exception as e:
        logging.error(f"Error retrieving existing file inventory records: {str(e)}")
        return "Failure - Issue Retrieving Existing File Inventory Records"

    # Build file inventory from workspace bucket(s)
    logging.info("Building new file inventory.")
    params = {}
    params["data_files_src_buckets"] = data_files_src_buckets
    params["google_project"] = "terra-349c8d95"
    params["file_inventory_dir"] = "ingest_pipeline/input/temp/data_files/file_inventory"
    inventory, retry_count = bfi.build_inventory(params)

    # Diff files to ingest
    logging.info("Diffing new and existing file inventory records.")
    ingest_list = [record for record in inventory if record["uri"] not in existing_uris]
    # Round-trip through a DataFrame, as before, to produce the list of record dicts.
    df_inventory = pd.DataFrame(ingest_list)
    records_dict = df_inventory.to_dict(orient="records")
    if dry_run:
        # Report what would be ingested without touching the dataset.
        return records_dict
    if not records_dict:
        logging.info("No new file inventory records to ingest.")
        return "Success"

    # Build, submit, and monitor ingest request
    ingest_request = {
        "table": "file_inventory",
        "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "array",
        "load_tag": "Supplementary file ingest for {}".format(dataset_id),
        "records": records_dict
    }
    # Up to two attempts in total, pausing 10 seconds between them.
    attempt_counter = 0
    while True:
        try:
            # Refresh the client on every attempt before submitting.
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            return "Success"
        except Exception as e:
            logging.error("Error on new file inventory records ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                sleep(10)
                continue
            else:
                return f"Failure - Ingest error: {str(e)}"
    
# # Loop through datasets and ingest additional files if necessary
# dataset_id_list = [
# 'd74b26d5-24bb-4696-84c3-bcd1f5f90b08',
# ]
# results = []
# for dataset_id in dataset_id_list:
#     status = ingest_supplementary_files(dataset_id) 
#     results.append([dataset_id, status])
#     results_df = pd.DataFrame(results, columns = ["dataset_id", "run_status"])
# display(results_df)


In [None]:
dataset_id = 'bf9108b6-bebc-4b3b-8517-6a2cce5f7d89'

# Every failure in this cell raises instead of printing and carrying on.
# Continuing after a failed step would either hit a NameError or, worse, reuse
# values left in the kernel by an earlier run (e.g. another dataset's BigQuery
# location) or diff against an empty file list and treat every file as new.

# Retrieve dataset details
logging.info("Retrieving dataset details.")
try:
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    source_workspaces = dataset_details["properties"]["source_workspaces"]
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
except Exception as e:
    raise RuntimeError("Failure - Issue Retrieving Dataset Info") from e

# Use source workspace(s) to find workspace bucket(s) to look for new files
logging.info("Determining source workspace bucket(s).")
data_files_src_buckets = {}
for ws in source_workspaces:
    try:
        ws_attributes = utils.get_workspace_attributes("anvil-datastorage", ws)
        src_bucket = ws_attributes.get("bucketName") or ""
    except Exception as e:
        raise RuntimeError(f"Failure - Issue Retrieving Source Buckets (workspace: {ws})") from e
    if not src_bucket:
        raise RuntimeError(f"Failure - Issue Retrieving Source Buckets (workspace: {ws})")
    # Several workspaces may share a bucket; register each bucket once.
    if src_bucket not in data_files_src_buckets:
        data_files_src_buckets[src_bucket] = {
            "include_dirs": [],
            "exclude_dirs": []
        }

# Pull existing file inventory from BigQuery
logging.info("Pulling existing file inventory records.")
client = bigquery.Client()
query = """SELECT uri FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
print(query)
try:
    file_list = [row.uri for row in client.query(query).result()]
except Exception as e:
    raise RuntimeError("Failure - Issue Retrieving Existing File Inventory Records") from e
# A set keeps the membership test in the diff below O(1) per file.
existing_uris = set(file_list)

# Build file inventory from workspace bucket(s)
logging.info("Building new file inventory.")
params = {}
params["data_files_src_buckets"] = data_files_src_buckets
params["google_project"] = "terra-349c8d95"
params["file_inventory_dir"] = "ingest_pipeline/input/temp/data_files/file_inventory"
inventory, retry_count = bfi.build_inventory(params)

# Diff files to ingest
logging.info("Diffing new and existing file inventory records.")
ingest_list = [file for file in inventory if file["uri"] not in existing_uris]
df_inventory = pd.DataFrame(ingest_list)
records_list = df_inventory.to_dict(orient="records")
records_cnt = len(records_list)
logging.info(f"New file inventory records to ingest: {records_cnt}")

# Break records to ingest into chunks if necessary
chunk_size = 100000
chunk_cnt = math.ceil(records_cnt/chunk_size)
for i in range(0, chunk_cnt):
    if i == 0:
        start_row = 0
        end_row = chunk_size
    else:
        start_row = (i*chunk_size) + 1
        end_row = min((i+1)*chunk_size, records_cnt)
    # Write out chunk to file for ingest
    destination_file = "file_inventory_" + str(i) + ".json"
    with open(destination_file, "w") as outfile:
        for idx, val in enumerate(records_list):
            if idx >= start_row and idx <= end_row:
                json.dump(val, outfile)
                if idx < end_row:
                    outfile.write("\n")
    !gsutil cp $destination_file $ws_bucket/ingest_pipeline/input/temp 2> stdout   
    # Build, submit, and monitor ingest request
    logging.info(f"Ingesting new file inventory records into TDR (chunk #{i}).")
    ingest_request = {
        "table": "file_inventory",
        "profile_id": "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61",
        "ignore_unknown_values": True,
        "resolve_existing_files": True,
        "updateStrategy": "replace",
        "format": "json",
        "load_tag": "Supplementary file ingest for {}".format(dataset_id),
        "bulkMode": True,
        "path": f"{ws_bucket}/ingest_pipeline/input/temp/{destination_file}"
    }
    attempt_counter = 0
    while True:
        try:
            api_client = utils.refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            ingest_request_result, job_id = utils.wait_for_tdr_job(datasets_api.ingest_dataset(id=dataset_id, ingest=ingest_request))
            print("Success")
            break
        except Exception as e:
            logging.error("Error on new file inventory records ingest: {}".format(str(e)))
            attempt_counter += 1
            if attempt_counter < 2:
                sleep(10)
                continue
            else:
                print(f"Failure - Ingest error (chunk #{i}): {str(e)}")
                break