In [None]:
#!pip install --upgrade data_repo_client

In [2]:
# Imports
import import_ipynb
import ingest_pipeline_utilities as utils
import data_repo_client
from google.cloud import bigquery
from google.cloud import storage
import google.auth
import google.auth.transport.requests
import pandas as pd
import pandas_gbq
import datetime
import os
import re
import time
import requests
import logging
import json
# Workspace settings read from the notebook runtime's environment.
# A missing variable raises KeyError here, before any later cell runs.
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name without the leading "gs://" scheme (left unchanged when the scheme is absent).
ws_bucket_name = ws_bucket[len('gs://'):] if ws_bucket.startswith('gs://') else ws_bucket

# Display options: lift the row/column/column-width limits (None) and set the
# console width, header justification and float precision for rendered frames.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.width = 1000
pd.options.display.colheader_justify = 'center'
pd.options.display.precision = 3


# TDR Reader Management

## Remove Undesired Readers from TDR Snapshots

In [None]:
# Function to remove erroneous readers from snapshot
def clean_up_ad_readers(snapshot_id, readers, max_attempts=1):
    """Remove every member of a snapshot's "reader" policy that is not allowed.

    Args:
        snapshot_id: ID of the TDR snapshot whose reader policy is cleaned up.
        readers: Iterable of member emails to keep. If it contains the
            placeholder '$AUTH_DOMAIN', every auth domain listed under
            "auth_domains" in the snapshot's first source dataset properties
            is also kept, as "<auth_domain>@firecloud.org". The iterable
            itself is never modified.
        max_attempts: How many times each deletion is attempted before it is
            reported as failed. Defaults to 1 (a single attempt, no retry).

    Returns:
        None. Progress, failures and the remaining readers are printed.
    """
    print("Cleaning up readers for {}...".format(snapshot_id))
    # Work on a copy: the caller reuses the same list for every snapshot, and
    # appending one snapshot's auth domains to it would wrongly allow those
    # groups on all snapshots processed afterwards.
    reader_list = list(readers)
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

    # Retrieve snapshot, grab auth_domain
    if '$AUTH_DOMAIN' in reader_list:
        snapshot_response = snapshots_api.retrieve_snapshot(id=snapshot_id)
        print("Snapshot name: {}".format(snapshot_response.name))
        try:
            auth_domain_list = snapshot_response.source[0].dataset_properties["auth_domains"]
        except (AttributeError, IndexError, KeyError, TypeError):
            # No source, no properties, or no "auth_domains" entry: nothing extra to keep.
            auth_domain_list = []
        reader_list.extend(ad + "@firecloud.org" for ad in (auth_domain_list or []))

    # Retrieve snapshot policies and delete readers that aren't in reader list
    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id)
    delete_count = 0
    for policy in snapshot_policy_response.policies:
        if policy.name != "reader":
            continue
        for policymember in (policy.members or []):
            if policymember in reader_list:
                continue
            # A fresh client is built before each deletion (presumably so a long
            # clean-up does not run on an expired token -- TODO confirm).
            api_client = utils.refresh_tdr_api_client()
            snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
            last_error = None
            for _ in range(max(1, max_attempts)):
                try:
                    snapshots_api.delete_snapshot_policy_member(id=snapshot_id, policy_name="reader", member_email=policymember)
                except Exception as e:
                    last_error = e
                else:
                    delete_count += 1
                    last_error = None
                    break
            if last_error is not None:
                # Best-effort: keep going, but never hide a reader that could not be removed.
                print(f"\tFailed to remove reader {policymember}: {last_error}")

    # Print results
    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id)
    print(f"\t{delete_count} erroneous readers deleted.")

    for role in snapshot_policy_response.policies:
        if role.name == "reader":
            rem_readers = ", ".join(role.members or [])
            print(f"\tRemaining readers: {rem_readers}")
    return

# Clean-up snapshots
# Readers allowed to stay on each snapshot below. Adding '$AUTH_DOMAIN' to this
# list additionally keeps the auth-domain groups of each snapshot's source dataset.
reader_list = ["azul-anvil-prod@firecloud.org"]
snapshot_id_list = ['b0fc6253-d274-4e53-9977-85d943116f7c']
for snapshot_id in snapshot_id_list:
    clean_up_ad_readers(snapshot_id, reader_list)


## Add Auth Domain Users to TDR Snapshots

In [None]:
# Function to add a snapshot's auth domain groups back as readers on the snapshot
def restore_ad_readers(snapshot_id):
    """Add each auth domain of a snapshot's source dataset as a snapshot reader.

    The auth domains are read from the "auth_domains" entry of the first
    source's dataset properties; each one is added to the snapshot's "reader"
    policy as "<auth_domain>@firecloud.org". When that entry cannot be read,
    nothing is added.

    Args:
        snapshot_id: ID of the TDR snapshot to update.

    Returns:
        None. The snapshot name and the resulting readers are printed.
    """
    print("Restoring AD readers for {}...".format(snapshot_id))
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

    # Retrieve snapshot, grab auth_domain
    snapshot_response = snapshots_api.retrieve_snapshot(id=snapshot_id)
    print("Snapshot name: {}".format(snapshot_response.name))
    try:
        auth_domain_list = snapshot_response.source[0].dataset_properties["auth_domains"]
    except (AttributeError, IndexError, KeyError, TypeError):
        # No source, no properties, or no "auth_domains" entry: nothing to restore.
        auth_domain_list = []
    reader_list = [ad + "@firecloud.org" for ad in (auth_domain_list or [])]

    # Add auth_domain groups as readers on the snapshot
    for ad in reader_list:
        snapshots_api.add_snapshot_policy_member(id=snapshot_id, policy_name="reader", policy_member={"email": ad})

    # Print results
    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id)
    for role in snapshot_policy_response.policies:
        if role.name == "reader":
            rem_readers = ", ".join(role.members or [])
            print(f"\tCurrent readers: {rem_readers}")
    return

# Snapshots whose auth domain readers should be restored
snapshot_id_list = [
'0377b6a0-a203-46f3-a4e4-0238b21ce141',
'5184edeb-81f8-406b-926a-64604090904e',
'757824d3-599f-4fab-985d-9ed847d06a62',
'768c753f-6c78-4de0-98f8-80ee7878f23c',
'1d385cfc-4bed-4f52-8f7b-ea54fc44b4f7',
'dd00a8ba-ac49-481b-8d79-0e440adafd77',
'56876495-51a7-4d6e-b6ab-46f3da5b8100',
'120f4368-ef14-4ad2-8c70-a2667b3f8033',
'4d995f30-0c7e-4a98-88a7-1a7e58cbeef2',
'b7a9f284-01a1-47a6-a1bf-57ce7b2e674b',
'8ee64987-5785-4b05-b5eb-5ee4e074c558',
'428c8260-1b27-446c-8484-a28341b41dcc',
'20a5cdc2-bd2d-478c-8398-2b219565c290',
'1b20f271-5af5-4b72-8a81-d31ac8fac5f9',
'0b0de78b-bd70-4a78-8963-9e66f04b86d1',
'e4cc5f9f-a277-481e-9563-6d30035578ef',
'804f1129-2ad2-4f0e-8751-a60ccb14bdbf',
'1c9ddf6e-e641-4b2b-84fc-c7966efa1e66',
'6d70ee06-d809-443f-b018-0bc23cd880ea',
'00d059f0-afc3-4c72-a61d-6925194e220d',
'794ab48a-2707-4713-937f-492b01af56d3',
'b7731031-2e57-4948-8900-a6d549c3fd1f',
'5fb13a59-b09f-42c2-9a02-9be2e9d16e91',
'689891c3-a977-4aa3-a507-6343e177eedc',
'f6d8456b-4ebd-487a-a71d-00337cc0c7a0',
'c9d30b32-ae82-475e-a8bc-d88e0c489aee',
'6c22fd04-7226-4aba-900e-0060ff255b0a',
'0cd0b54e-de28-406e-a0e1-53ac23a0e683',
'26df2a34-b10d-4361-ba2b-d9f966d09f61',
'ba915a8d-24d0-4a94-9220-4f1d058521a1',
'6511b7df-04a2-499c-b940-7aa6e337abbd',
'2e342ea0-83e3-4df9-ad9b-867ce04c14b1',
'41ded0db-2a2a-451b-8a9e-0d94c2a81fe1',
'7e63fe80-9ae7-4c57-b87b-963ef7999c64',
'e0bbd924-bd13-44c7-946e-d89b6becc627',
'2e6dce09-c48f-4aa4-8d76-a4c8bb53b4c7',
'ff2e3cd4-44c4-4068-9e57-3023a3e533c7',
'd658a2fc-808d-479b-8aed-8f2a3f2993dd',
'dcc578ed-44bb-458f-8ff5-a78ca83f4616',
'aa42debe-3747-4dcd-8bc9-24eb90673fa5',
'a2da748b-fec8-4e10-88ee-de32cbe8dee1',
'e5d2f3f5-7bac-40d2-a127-1e82a658df52',
'28dc8121-5e55-46c2-8313-681de2298986',
'72c24fde-ebd5-4544-b8a4-4bad9a919add',
'36bbcc2b-0aee-4932-b575-d9975b296b1b',
'7e59197f-b859-4279-add3-de24bbc7e52b',
'5136703e-fbe0-4ca8-806b-3942eebf67ec',
'0df983d7-ed5e-44d2-acf1-686822b0cc7e',
'c02ebd95-b9aa-478c-8f30-937f203019ab',
'624fef99-e4ce-4c12-a3d9-90995b5da970',
'a68d3145-81c2-41f8-9944-5e4a5058934a',
'3ec72891-87d2-431f-850c-e52013330ea8',
'ea82e45a-b5f3-41a1-a392-08cb3ac6d585',
'b47d81b7-bb16-4b52-9f82-3f315cfc3d64',
'30851e99-bbd0-48d3-b4f0-e3525b0506ca',
'50a37ecf-071a-4f8f-9c72-70280973f9eb',
'ad71f2cb-a73e-463c-b0c2-560fa0f7bc67',
'895f4ecd-fdda-4e85-8fee-be0721b74184',
'369de272-7e29-4f76-8f26-87d4d941fa38',
'e9dcabec-7cc0-482b-83a5-f596e7a98db0',
'899bc1e0-c708-4ebe-8b79-b7e1984995d7',
'1a4cf90f-4deb-4dc2-b72a-25497a8b6b1c',
'f2480e7d-609a-4f35-8f67-9f02561928f7',
'5cd2e542-1090-4dfb-a7a5-b276b32e58dc',
'9a15fed4-ec1f-45be-b468-6e980c3bbbaa',
'556b008a-083e-49a5-bb70-b80b5799e8ea',
'96160d85-eca6-4b6a-ab7f-d33dffef013c',
'3c672fd0-d723-49f4-b2c6-d24d2658a049',
'c68b342e-35c9-4fe5-80ef-2bb821a942bd',
'44373227-8b15-4524-9ecd-57592c52a6f5',
'6e293720-2935-467c-b5fc-0f257eb1fb68',
'e3823ffe-3070-47b2-a0fc-7c0138e6c61a',
'd033876e-acef-4301-b134-d009395d75a9',
'ab1d91ba-6aa6-4a40-8c10-2a979cfb29ab',
'1bb35cec-4174-445b-a646-ff707abc2fd9',
'a632ce72-5246-48fa-a140-f97ee6e9d9b2',
'f87a7821-288b-4bc3-93a2-94ae34604540',
'4a532da2-a8cf-4d5c-9e0c-93c0a1af9084',
'6e6c8a5c-48b5-46d5-856f-28385f67e0fd',
'78cc6f1c-6d17-4344-98d2-e18b0fde2365',
'ae101395-36eb-4d59-9970-6696b82057db',
'8f987445-a04a-4121-ab2e-c34cc8dce719',
'f4a05db7-ff6b-4d75-8e87-68628830160d',
'5c65174d-ed85-405a-96ca-5a41e5930265',
'd431721f-060a-4b9a-b4d5-0d19fbf6ae0e',
'f185a14c-aab5-41e5-a891-74d9653e3e0a',
'6c392a22-a8c7-4e5d-a174-01026284dad5',
'5780d857-a368-4f7c-88d8-2d145552a01f',
'12c4738d-4d27-4776-b7d1-73a6b74fa56a',
'e95d4773-7a36-4031-ba31-920856187300',
'acf4504a-eb85-4aba-9ffb-1baa7266ad82',
'a37d9def-52ca-488e-9468-8e2e211fb3d5',
'd0263ca8-b8c1-4b10-9977-3558104c9154',
'658b1d66-1dfd-4c45-8b54-737a877cff74',
'd70b16e2-ffe1-4e63-837a-1f3e392e9f35',
'3c3f273c-2904-4900-97aa-6638e796598c',
'3984cfaf-0034-4b7e-ae21-8ae9810a62a1',
'f043891a-8919-4e90-8008-9c38c6fbf312',
'5c8ddfe9-2abe-437b-93ed-409c3ea5f488',
'29bb7c35-b2fe-464b-882d-d107e00c04c6',
'c3856d07-55e0-412e-9c36-6363e9520e18',
'cb350574-0522-45fc-b592-181a86cb4d17',
'42d2dc1b-9ffd-41ad-84b9-b92ed984470f',
'f0abbd9c-2c11-4d9b-be8d-de19f18ddfd0',
'728f209a-ef9c-4303-a93f-a7958dc40f0c',
'14bcf9ad-86ff-4983-967b-2a1ce86ae864',
'0ffa30ef-91b1-4908-b148-58191f64c97d',
'68b17a9d-48d2-4996-a3e6-3feb85011706',
'4d39a01d-0ed6-42b5-9200-91b0d848a42b',
'5fb6214a-9594-4ef7-b1a6-d2efd7fb5c87',
'e0e41b16-d394-4cb1-848d-fcaff4a8eac7',
'8eb8326d-a74a-4bee-b4ea-b1d211114996',
'aea7b522-eb2c-451b-b7fa-7bd932f1b971',
'0b2be5d6-4fe1-4afb-9106-2f4dbca31d03',
'63363aed-e5ea-4ba4-8962-da03369ca536',
'b550b4cc-d3a5-4317-bfc0-5e46c77968c3',
'11ecd102-9dc5-4cef-a838-a229b598fc76',
'da06cf38-5f72-439b-9464-fb5448bb6d6b',
'cd24ca43-95f7-432d-b729-3b62d9f95324',
'9fd7edab-f1b6-4fe8-98fd-4ea4c2d34501',
'15227f2d-06d8-4b02-89f8-e59ae4057f88',
'b84e9146-b4d8-4685-b9b1-541b2da269bc',
'f61d6193-468f-45ed-bf0a-75a5662871bd',
'be47d532-a9d8-4a86-bf58-00b2920dd320',
'5630c567-3752-4fa2-8124-c1b8bca37aca',
'52f35032-2afa-4722-983e-e88c8cb808ac',
'51865c0a-9548-4fea-a6e5-c8754a0bb085',
'a1dcd80f-6390-489a-a34a-168f26690a36',
'4fe793f8-96bd-4ece-a8b2-1e4fb6712b99',
'5250257d-e2a3-4cc5-bb07-aa8b03421ad0',
'af99a317-e7a6-4e0f-88fb-f2a6c438ca5d',
'c3bb5d5c-dad2-4762-ac97-a8d920b414b5',
'ee427556-94a5-44d9-84d6-322ed4419ac2',
'213a8eb7-5c74-461a-9677-e04e978cd7e3',
'2c670fb3-af55-4a30-bb31-4ebab5a0d3fe',
'9807332e-22e3-41de-bc41-a9944ba364fc',
'7bedbdb4-ec95-4011-8464-cfb267ff343a',
'29471c9d-7165-46d1-adf1-6a40ed905354',
'807e1239-6442-4a3c-a453-7919033aa03a',
'0d85a6bc-fa74-4933-8537-61d4792159ee',
'da818a37-2a60-4315-a6aa-333ea00e9e6f',
'e7c1e9aa-dab7-4a15-a9e2-cecbeb6989a9',
'208c3ef2-a34a-45ee-858c-38c9dcf86396',
'569905e0-65a3-4e70-b30e-8c731568c443',
'0c011d7f-1aad-49e7-8033-d6b036153f46',
'a0408818-ad55-42fa-a1b9-84537a4b3eed',
'48df4d69-f578-49d8-a320-0bec2fa5711b',
'e6dfa202-d2a0-407d-be70-84cb53c9f9ec',
'c191a23a-926c-4a61-8294-27496a41a4da',
'749af3ac-a652-445b-a2c0-80f24aca15f8',
'3e4fefea-7935-4a4c-bba5-84109c9a800e',
'afd608ce-943f-47e1-8d80-fdf43d58812d',
'2fb44dc8-06a9-4990-914e-63479c185299',
'4e03ae15-3680-4690-95cf-336a86ddd7a5',
'2e8e7c13-3c64-4686-a5fd-0b664bf8510e',
'f461ebf3-239b-445e-9540-7b15b64998c5',
'356dc4cf-688c-4299-b4a0-9c3d839c1490',
'0986817d-bbf8-4614-89b4-68ca7c69b0e1',
'6e429241-ea4e-4273-a92e-3d4978b55047',
'617b50da-87dc-47e2-813e-9271378f3280',
'968929f0-e200-4b68-afb2-f0656d5d6bfc',
'ed56f6ae-6c43-4e1f-b3cd-746e03a29316',
'6c57d44a-2bf2-4b27-aa51-fe341357ab84',
'2c71bf3e-64b3-4b01-b86c-8ee10007b22a',
'db2370df-cc70-4a6a-9146-fc99ff8eddac',
'e036b126-1249-4661-98d4-db6218f351e8',
'e9fa838e-b173-4262-8fb6-e5eef53856ab',
'8c634fb0-da0e-403c-8e4a-13cef21411a7',
'636272e4-d4e2-4a25-ba10-e1d1cb9352bb',
'0acc57c8-5c8e-44ab-bab5-6dcb7b6465ef',
'63a987d4-bfca-46c4-bbbc-afdbf357308a',
'538662ce-44d2-4fb9-ab3d-f02342d26761',
'f58f9cc2-70e0-40fd-8adc-674adc503f8f',
'ffc3e5ac-95e6-464d-92dc-1c4fd1ca394e',
'7406b139-dae3-499e-94f5-b762fee73bd7',
'8b726cda-c018-45de-bb98-39915c912035',
'ea9298ef-2d9c-4237-9a9b-48a8854ee042',
'b6ae2316-8eed-49ea-ab6e-3425a9527549',
'2a8eaabc-68e1-4962-bf1b-332f1b856a78',
'2569d9ca-20bb-41d6-a7ad-505b7a2c33e8',
'f5af85f2-ff96-47d3-97f6-b18585d54d81',
'0a06a398-3638-437c-9e81-5fb96bb6fc9a',
'423099cd-1739-45e6-9225-06bdabcba8f7',
'14810a30-fbc5-459d-b2c7-0378125e25bf',
'b1cb0a2d-9c11-478a-82f7-6b239a6b7ca0',
'1320c44f-c27b-42e3-9870-5676d340e923',
'1918b1ee-fa43-49a2-8e5a-d3730c0c20cc',
'5681d110-8c84-478c-9d1f-7935a54b86ca',
'24c427f6-17b9-4cd1-962f-92a12b090d8a',
'9a5be8f4-eaa7-4358-8fdc-470a6f1da79c',
'6fdea8c7-69d9-466e-9fa2-aca30722ff68',
'8a7b6bf0-dd75-44fe-98f9-43ecd8612145',
'a218159b-1333-4550-a3e0-bf8610425fd8',
'ad6660ad-3052-4f68-8e8a-febd57adb43b',
'e3797059-80ed-463f-89ca-e77589f2fdb4',
'87d02347-d169-4ce0-9027-3c8e11e48c40',
'8857ce53-0bb6-40be-a536-3dc658723419',
'40ebc4a1-94ea-4b5b-adeb-89b171f2a957',
'61b6ae23-ca19-4d31-bad3-2281a8528886',
'7c4edc65-bfe6-4ede-a68a-c0b9d2564f29',
'e511ebe0-d2a3-42ad-b06a-ef083bb6d943',
'f330517e-46fd-4de3-8063-015b524a7324',
'f0d8bb27-1695-4faf-8b27-4b95260b8f17',
'17d14df1-cb64-4aae-8049-c1728a3c0c81',
'434f85e2-4435-483c-8099-b03c8ba794ed',
'148bba37-06ab-40b7-a0d4-a04fc515465d',
'b5271312-1c86-4336-b039-4216f95e298c',
'a588077f-5960-4cd9-96ff-dca22fdbec90',
'c3d22305-b3f2-4561-a5b9-bed82ee742f4',
'9fe2abd4-70b4-4eee-b00d-38726ced8620',
'5329c25e-ccad-435d-9250-6fcc3ff88472',
'ced601b2-9a11-40e9-8067-241e5a5996ed',
'fc513b58-cfb7-4871-8694-8dc372fc2e10',
'4117144f-92e7-454f-9263-dad5e128cadb',
'ce2e7235-26e6-470f-8e05-298193b7f53d',
'ea50255a-45a4-4846-82e3-02b4f46f5b17',
'b763c288-4132-434a-a6c9-25ad51b9d961',
'3bd33dba-f7d4-4a86-81f7-bf0bf9f8d335',
'079eb53c-e2b6-4da6-ab5f-fc2136a3ecc1',
'7e54b39f-cd63-4a3e-bbeb-b76e0774d424',
'ea4c6dab-ad3d-40f0-95c9-e68ee79f7a6c',
'd3dc5627-503b-48a5-ad79-31ab6c2fd417',
'ec14f8cd-5b1b-4124-a235-f11159984c7c',
'6d9e1212-4fa6-4632-be8a-75c45a474dd3',
'667eac9b-4e90-413d-80f3-d857b9829ab7',
'cdd689fd-10f3-4cfa-b738-46549e689cac',
'c091ea30-1862-4b1f-8e92-087b441472c3',
'43c86818-9bfe-46f2-9ae4-4a55a7baef1f',
'ebdaca04-ef29-42f3-8486-a94dade81bf8',
'd091a2a6-53e7-4721-82b8-09ccef9b13cc',
'de2da97c-3a14-4a6d-b50b-5dc8e1af2803',
'2c441f75-dc1d-4674-9118-a93c5141b748',
'8165245c-2003-4ec7-bf57-731959022d47',
'15706251-fde6-4cd1-ad3d-dff1dbb1dd97',
'c4dcf7e0-195a-4885-8864-55a9d65cdb5f',
'77fbf845-e43c-4015-93d3-6acf55d83022',
'cea0dc44-e5ad-4116-aaca-d4c0dea68547',
'b052703f-ad71-44ef-b76a-654cc13fa97e',
'db6d79e6-6064-4619-9e49-d3ee054c8302',
'a2c0bd50-4f89-4f1e-b25e-0f0c56b29b31',
'18dde45b-410e-4046-a051-46885a21c02d',
'30d1fb84-6746-4d0d-8d68-f1c9cf955504',
'9a20df71-4752-429b-9021-917045005452',
'ea0e5966-6573-4e1f-bd11-48f64595fadf',
'1a26532c-16e6-4f1c-81f9-8f07a8181421',
'28559e94-ed57-48c8-bc8b-6cc4ad659a61',
'd67d11a8-4356-4cae-89d2-92e724f93f2e',
'2a1375fc-a976-4327-829f-d0d0f6155cc5',
'ce1bf5c3-525e-455d-a1e9-dd5f3d68c9d3',
'd0a6aa4c-821c-4bba-b53b-4f230ca3cda4',
'c6262801-594b-42d1-bf08-154f64cd76d5',
'c9ae3cd3-2174-4e76-a610-a54c95378a98',
'3ac713b5-3645-4381-ac66-ecbc281a2ab8',
'4911bd18-5db9-418a-9dc0-0ea28ae937d6',
'c5f294ea-87e9-4cbb-8099-2b5401add5a2',
'5fb3cd44-691f-41ef-a009-5a401b5fcae5',
'853c7de3-b0fc-43d7-bd4c-53144d276573',
'33c854eb-d228-4a82-8324-5e455ed1e447',
'd0709a13-9701-437d-848f-fbce26b3bf5b',
'cdda79ce-cf89-45cc-9a78-04a22fa833ce',
'533ba93b-506e-4547-9174-037a6b17835d',
'06216d97-7d1d-4105-bf60-958b71c02cfd',
'51e19b3b-8a51-4e2b-8a9a-bcbb95921a28',
'39e02242-e1bb-4937-b3ee-d7f81e094d75',
'87b55203-983f-467e-b496-9a0d21f4151d',
'f06adf86-4526-47a8-b59a-2bf137e034d2',
'bbd04481-0b9d-4c21-ba65-a43638116e0f',
'dbdfebae-3eb0-4fc5-b744-eb901da3591c',
'7e748fdb-7dd9-418f-957c-7a68f07aaa8d',
'29fa069b-8df8-4fb3-bfa8-01e0504d050c',
'2b78a3ac-8bca-4938-bc7c-26a60f9c04ac',
'4bb891fc-fcae-40cc-bf59-73716de7e04e',
'508b9f8a-c827-4dc0-8319-6aeb90482bdf',
'bfd29198-ca9c-481a-ae8e-d8ec49bdf84a',
'56187783-02d4-46f8-bc8a-cce00125ce58',
'20eb6baa-99b8-4e24-97e3-98a402fbe975',
'f6da1eb2-9dec-48fd-abcd-d98bf2d21e47',
'd370b858-4fb6-413e-8bfb-97f98e8f3d77',
'e7cba2c4-6b44-4d70-9449-472a1e095a65',
'824afdf1-50d9-462f-9f09-db5a1f646bd8',
'2ae00e5c-4aef-4a1e-9eca-d8d0747b5348',
'b9314197-1618-4dd7-8441-38dfb1490389',
'1949b996-2f50-4c66-8656-ee1aea6c9b80',
'761e172c-f530-4154-b5b6-a1c52b0530e6',
'392d09d1-3e69-46bc-84be-679e7bf52d1a',
'e1c34b81-2435-4c12-87d7-3f995cfd4a0a',
'32427168-2013-43bb-a100-89e1b38c8998',
'574e0d42-e712-4a86-be7a-4b3a95187bcd',
'293429af-d91d-4af7-8d8b-cb33aab4a055',
'cab35bdd-4b15-4836-8470-b922d5761602',
'388f45e2-a4ef-48e5-9b69-0128db5a25e2',
'56078c29-a393-4c60-9e04-3674e02fe729',
'011b65c5-fd63-478c-9396-a16c96f61a11',
'16784a9f-1796-4a1f-a0da-a61392fcd127',
'099d2585-1379-4333-b3b1-ffc0d26d95c5',
'19997cd4-25ed-46a9-be6c-77049f1c74eb',
'a35fc432-b9ba-4633-bef7-4e317ff34df5',
'cd19195f-25a0-44b1-b47d-ec99141833fc',
'9a61b980-4a33-465a-bc50-1aba00bc2cf6',
'737d454c-88be-477f-ae2c-ef473e2106ce',
'90fe2016-e79c-456c-a5f9-3a31149fcd65',
'3bdbad9e-f9d4-4442-8606-791d490bf0af',
'02d25240-823f-4b1d-8562-95385716a453',
'1974a21b-c409-4736-a3d7-e195fa96c4eb',
'99b46287-4790-492c-8a12-bea33f0f927c',
'08d19a7e-b868-4766-9f7e-d879d972cbd7',
'c6ef5822-3929-4ae7-b5bc-dc27528bf226',
'e43974fd-cee1-4d8c-a436-6846d7d24129',
'0d607d21-c9c7-4852-83e3-76825176ee0a',
'0a356156-961d-4829-b9b5-c07fbc73dacc',
'4c8ce027-8094-4f5d-bf62-22b1d51b3c1e',
'f8781fbf-5fef-4481-8819-3df1bc724b7f',
'eb7948be-1007-4b0e-b9b6-a5c40bbb9596',
'17d5d317-9378-43bd-a088-726ffbcde6c5',
'07d2b703-db1f-418a-97ed-eeeab088718f',
'f20753f0-d09a-4b47-bffe-8f24ec354761',
'c1c674dd-056a-470c-8874-bf70d8fae3a8',
'aa2bfacc-c28c-4192-960c-b1389cf68516',
'5b036d13-e058-4d8d-be91-6fdd070686a7',
'8fd5b447-77b6-4c33-b66a-a5cc63587220',
'69007912-8f45-4925-a857-57a5c09ca536',
'b7b2f00b-5bac-4996-a23a-1df0d4099157',
'410667f8-8811-47bb-b5cd-ddacba7185e5',
'f4b1bf68-fb9d-47ed-84d9-18d7224da3bf',
'c51470e4-cb99-45d7-8ffc-3d346e557b4d',
'44b1f60b-e74c-4430-9378-d4a75e2de72f',
'a4c62d7f-34f0-4e2e-9e46-c762d3ab0ff2',
'6a5b3be6-d1de-4f23-a431-b08e7ab231b8',
'b735d811-a7ed-4d82-8b9d-5f23a9f33936',
'5208772d-21f9-46b0-8167-0b05b57296b8',
'ffe34538-3ddd-48de-b4a2-94f9b2dad086',
'ad66f7fe-1c4f-4cdb-99ed-7afba867c139',
'632ce3eb-610b-4467-b39f-6adf4a5d2508',
'8fbe2def-b8ad-4b2d-90c9-0dd4517c67e1',
'5955a235-5be6-47bc-8303-ed0c4e68f501',
'bce0dfaa-0351-42ac-8b5b-47dda936bf29',
'36690013-e8bc-43a5-9ba9-83317537557c',
'172bada7-f1c5-41c4-836d-05381beaed9a',
'133e902c-5ff0-4119-8078-db3e15006844',
'03e54581-8fd3-47c3-9143-55368d2e4e86',
'9efd748c-ad09-4765-b645-1b6ef6b5d402',
'2c6de04e-104d-42c8-8448-97d74985dacb',
'452bcafd-ab45-4e24-b5e0-13fcf22b0755',
'fbafdd31-21a0-44c5-ae4d-724839beff61',
'2a1882d9-88ca-4849-bcc1-f6914f593407',
'9efae3c7-904c-48a8-939a-e82b46005ae1',
'3838993f-59ba-4dec-8110-ac3ea387ab91',
'bf2f4106-cee9-419c-b4d1-d7b03a6293d5',
'a6c36f5e-b86c-4164-85ae-8bf0df2e4a90',
'5e547934-c339-410e-a013-dfefed50f4b8',
'ffa84feb-ca0e-43d3-a04d-a402a8e24a3b',
'2be072bd-2153-4050-9358-e4b95297a9bf',
'e04edfef-69f8-47ff-8df9-dfff0e9218d2',
'1851ecd5-5e95-4ca4-afe4-9493d2dc55c0',
'7c19d852-e36a-4353-afea-10e501601d9a',
'fd3843fe-ee5d-4784-b0d2-6673f9886d30',
'84703c54-a9dd-400c-9701-2fc40922e3e3',
'00297802-e20a-413f-b389-a6f764b6600e',
'b8a455eb-827d-43a0-a89b-5d017747140f',
'f2a7be5a-4f7a-4a96-935e-ca7592855b45',
'c853d4c0-d4be-433d-964e-e30bdc35480e',
'ff27037e-cb52-44ef-8979-f6e7ac3ed6f6',
'830df9ed-e4a6-4c9a-a97a-aa080fb030e4',
'9321b908-f2e4-437b-b53e-ed81754dcace',
'7c90289b-be3e-4c9b-917a-d5e27d95dc15',
'0f46a588-b4ff-4a69-99e9-0a0bcf052522',
'3e85b06a-a6ea-4ce8-a655-44b1fce12138',
'faa97a6f-3a33-4cf0-b6dd-c29fea9b1398',
'328745cc-e527-4780-af6f-30ab69d26702',
'6e674477-522f-4adc-8c50-76910a6a282b',
'504089f1-c59d-48fe-84ef-858bd3eb3043',
'e91ccc70-2772-46d8-b586-cf3e270a05b5',
'247dacae-8e6e-4928-ac45-421d19b549a5',
'1c4015ec-c6bf-48c6-9a8f-144498bac5ae',
'4645cb09-c96e-4750-bc8f-b9ba2e61f2fa',
'47844eab-a4b4-4413-9745-6d988ac4100f',
'4b5e77cb-b79c-4a6e-aa67-ab23afa9d141',
'1254ef3c-3f58-41c9-a52e-eafef4492c13',
'e224f92f-0b40-420b-8e7a-dbc268107bc2',
'8873dbd4-e4a4-45db-8908-3c68593031d6',
'2185fe0d-9816-4f11-ab65-ee8f969847fc',
'c5514b78-183b-4b72-9e32-2e473c63086c',
'9251ff73-9932-4ead-ab96-91feb6c56935',
'94abf97d-45c1-441e-8d6c-355e9557b9e6',
'721a0e30-9c7c-4ea2-aa5b-d8a1416e60a1',
'1c9884a4-39ca-4d8b-b6a7-bfb1378b6012',
'd223db2c-cc91-47cd-9fb0-050e0e7940f0',
'bccd9f0a-d1e4-4d48-9d20-91b293e4a57e',
'1be2372c-07c7-434e-be09-1f3ff095ecf1',
'825157b9-d84b-474b-90f6-0994e7bac378',
'4dcde6d0-a57a-4fcf-8281-882f783d0583',
'1a256e91-a1b4-4db8-91cc-3be204872b26',
'd0acae97-256d-44ac-a55f-efe4f6cf2af6',
'696b5a81-c93e-4acd-8448-4b5576d14ea0',
'e00f95e2-92d1-41a8-9bc9-d4978ebc68f9',
'c890024c-40ed-42db-ae45-b119d038461e',
'cc033b0a-6285-426b-8d6c-f29739b62920',
'4022a967-0753-4f74-a682-b980528c112d',
'68af6886-c7de-4a2f-abde-0314a301ac1a',
'ba1a01e5-23e3-417a-a45d-91368dce617c',
'46bb697c-4b2c-4ae2-90d8-4fed2a00f831',
'72d457a1-ea5e-4269-805c-ef25bdb12070',
'ea4e8c79-b6bd-4b24-990e-624de9d15835',
'c98c2d47-ebe2-42cb-adc7-be2475812bea',
'853f9898-8b51-49c6-995c-2a9fb1839d60',
'f3ed7376-db20-4ee8-80e0-535b6ae6b770',
'341c3961-f1dd-4d94-8069-37f382242b18',
'f45becfb-890e-4a71-9b87-ec1deeb8503e',
'fc782fb4-c739-4531-a71d-9388443c319c',
'3096d7dd-458e-4840-a7f9-715aedca92cd',
'd72f1822-4f99-441d-961b-2a5cf6635f42',
'cae2218b-5eab-47e6-bcde-226f212d2bb2',
'7b8b09f2-80cf-49c8-bd74-42b00e850cde',
'ce525190-7d7f-4e57-8176-398cd9b0b7c5',
'b25001a1-1479-4d2e-a63f-298fbe42d8ed',
'd71c41bd-f4a6-423c-b567-b5d6c290b373',
'75b365d3-3f8e-4cd1-9aac-e1fccecf6cde',
'7a886f4b-2a04-4843-a717-62e6283d4254',
'92bd5c0a-fff9-41a5-b107-7f1bcd72bea1',
'09c4a83d-9d0b-4cf7-b04a-0747e656e019',
'15b35c76-49ec-4225-b91e-0ff0b43e8136',
'3ca70728-bffd-4cd0-9bf2-7a479532c9e9',
'1bb3d012-1637-4f61-ba1e-a8549a43973e',
'c8745002-326a-4e17-84d9-3045cfcea085',
'8e73f31d-403a-458a-a1d2-c9048c24310b',
'1415eb56-449e-473e-a8bb-f8616c1ff851',
'6dcadf4a-71db-498f-87be-3b6bcec912e5',
'e31b204e-f42c-4774-a239-91968b13a682',
'f2797094-662f-4041-b373-338d89ac5a7f',
'59e23ef1-8c18-42c3-a075-b5e5e5e16dec',
'0254cc08-1474-4b3c-ae99-f7d853042dc8',
'531db83e-3f7f-4732-81f0-013addbf2a8e',
'f1c03eab-24bc-4b3a-8aa9-d6696dfaaf31',
'f875fdc4-f57d-4a4c-9b22-daf101156d26',
'27068295-b3c0-4260-9447-9ca96814d46f',
'e588585c-4e81-422c-9058-746203958824',
'fd57042b-4676-49ba-9d2e-161c83e0f3bf',
'32025456-3114-4140-a712-d38122f3ee71',
'a39fc400-2146-4949-9a94-fd3d4f1b182c',
'02706895-171d-403f-9f36-fa7e45d09a9c',
'31018599-8ada-472a-a8b3-920f3057f6f3',
'cd1181b4-b3ed-4c78-94af-aed1edbc64b1',
'989ccbba-f39f-460a-95d1-e7542529c26c',
'988c0084-bf6e-4838-846c-373a01f3458c',
'47e551d2-0850-4fdd-98dc-cf01eb6ed839',
'5f95a5f3-319b-426c-afa0-2b4d1773411d',
'eb719163-2dff-41a0-811e-3c00d182a7f6',
'060c707a-2f0d-4730-bbd6-d25489abfcf6',
'fd068962-fa12-435c-87b1-8baca1788839',
'8996212e-d0e0-4305-8638-587cfb61bf8d',
'a3b18d45-96c2-4526-8fde-65ab3265868f',
'e05f5d24-4edb-40e1-a293-533f33c2c86d',
'ea90e903-9835-481e-b3c1-7451d2211de7',
'6b8b2cc4-be14-443e-bda5-eed5fe0ffb2e',
'ddf7ee7d-3234-4f8d-a1e4-305588cd1009',
'b6055f92-ef68-47ba-994a-d51b947a858d',
'cc8cc17d-1ae4-4303-abd2-4728a676e5a2',
'b0dd99ec-f83f-4d76-952e-8c1923fa9710',
'613ec6f4-dafb-4689-b109-4573ddca5853',
'9f4946d0-94ba-45b8-b769-f894790cc1df',
'fb4ca443-d278-4450-bb56-a0b3035c71c6',
'0f477861-a41c-4435-8935-2aface2e62e6',
'c8c3bb66-e4bd-456f-9d38-e82816118807',
'2d5d9ec7-b748-4ae2-806f-f4bd687c36da',
'9a17587a-ae6b-481a-8d88-f479981c767f',
'4dc90a20-5c4c-40da-9b36-beaf740f8983',
'9bc462b1-2cb5-43e6-82bc-ec257ed35455',
'49b3d29d-f734-43e8-9454-ea3ab9631341',
'4602d8f0-a679-4c26-9b83-608d04abab99',
'6301be73-aa22-458e-9562-d87ac6c7d217',
'a08ee68f-0e5f-4cd3-ab88-b3740ddf709a',
'500080ec-6911-4d78-942c-b0d4c7143894',
'142f2d93-f04a-42ae-8e24-0324c9d7863e',
'4fc3ed75-feda-4498-89b3-46024655704b',
'6cdddd59-711f-4d72-8383-cfa349d58a3d',
'f8b5dcf4-e9ff-42d5-860e-f36033d62522',
'6173529b-c677-4fa2-9580-feda9fec3f4f',
'e5424ee2-ebee-494a-ac5e-16d7c56453ac',
'2529f127-cff5-43ff-b879-06bc0e3468ff',
'5bba97dc-d6ab-4329-912f-148c8b807056',
'fa2552f6-b6f8-478a-8fc6-19fb6d612837',
'9cf61d88-d096-4981-b0c6-99db77554c01',
'7c237e08-3329-4e64-bd2a-063be290e78b',
'6df525e1-b143-4e6f-b667-80c783ae1b66',
'92666b7c-4d50-4530-88e9-ea2d3da9d07a',
'42644c25-fa23-4b4e-8fcc-907cd8dcef60',
'155c11a9-638a-45c8-b172-7cf2e3e16fe6',
'b3da9fec-08ad-4496-a9ac-1411388fb5cc',
'0de07296-e3ff-4fe6-9183-9f421484197c',
'1b6273c6-7769-4daf-abee-93b11b322c73',
'eb7045e1-2286-49f1-bce6-21b5d7fa5c32',
'4c722626-c559-4f5a-84bd-8d7d46983e1e',
'510abca2-02d6-4773-909e-70746a444987',
'b78a77b3-0a1b-410c-8afe-193d277e645b',
'1410a32b-4ee6-4bd3-96d1-4848d38769d8',
'b77d83c7-2a8e-4f50-be1a-7848f28dc8cb',
'2efa7d84-2850-4e6f-bb26-7d13ad147b44',
'206009c6-cc98-45ab-b504-e6c3a3162a23',
'e5ccacfe-1b14-4331-bd8f-a542b5a70d23',
'96874f3e-3e02-400a-96d1-5bd20d4cbc09',
'651d2fd2-fc96-47b3-909d-0dd46f575dbc',
'84cfc3d8-282e-4102-ae43-5513e7a3efd5',
'da841552-40bb-4f05-8edf-ad0a76ed13ac',
'40c4297e-d492-4f6a-b651-ee9ee38db14b',
'8956cc4d-58be-46ae-a81e-74607ffbd9d3',
'93d56775-9557-4e88-bb15-007bc86181ae',
'8bb8ac5f-cb35-4253-9516-d80bd581dcb8',
'833ee2f2-9333-401b-8f53-fa4353fef66a',
]
# Restore the auth domain reader groups on each snapshot in snapshot_id_list, one at a time.
# An exception raised for one snapshot stops the loop; later snapshots are then not processed.
for snapshot_id in snapshot_id_list:
    restore_ad_readers(snapshot_id)
    

# Collect AnVIL Snapshots and Datasets

In [None]:
# Dataset_ID Filter
# Only datasets whose ID appears here are processed. Leave the list empty to
# process every dataset returned by the "anvil" enumeration below.
dataset_id_list = [
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '119c7ceb-ad4e-4b6c-9f5f-edb08239aee7',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    '2cace5dc-f660-45d4-b689-c4c89e77697c',
    '68ea655f-b4a3-43e2-95e4-f158ca2d67dd',
    '6fd0f009-3c34-4529-9a38-c59745545490',
    '841b7883-9447-4ea0-ae4a-84ea0240d919',
    '96461004-f4b3-4f82-a842-293b3ec46a60',
    '9f4ac69c-0919-4ac1-98a8-976ed79ace03',
    'a3ea4f97-6657-4d3c-9be6-96f097f5c952',
    'bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8',
    'd0fc3d6a-c3f4-4533-8a23-817a4e27f9be',
    'd2272f2d-c606-4027-b8ea-0bdd6d9d6535',
    'd596ee91-481c-4eb5-9a8a-88c1e10ba9b6',
    'ec6f49a2-176c-4564-82c5-e751baab46aa',
]

# Collect Anvil datasets and snapshots
# Builds one row per snapshot (or one row with empty snapshot columns for a
# dataset that has no snapshots), then displays the rows as a sorted DataFrame.
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
logging.info(f"Start time: {current_datetime_string}")
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
datasets_list = datasets_api.enumerate_datasets(filter="anvil", limit=2000)
# With an empty filter every enumerated dataset is processed, so the progress
# total must be the enumeration size (min(..., 0) would always report "of 0").
if dataset_id_list:
    dataset_list_len = min(len(datasets_list.items), len(dataset_id_list))
else:
    dataset_list_len = len(datasets_list.items)
records_list = []
dataset_count = 0
for dataset_entry in datasets_list.items:
    if dataset_id_list and dataset_entry.id not in dataset_id_list:
        continue
    dataset_count += 1
    logging.info(f"Processing dataset {dataset_count} of {dataset_list_len}")
    # Only datasets named like ANVIL_<name>_<8 digits> are reported.
    if not re.match("^ANVIL_[a-zA-Z0-9-_]+_[0-9]{8}", dataset_entry.name.upper()):
        continue
    dataset_detail = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["PROPERTIES", "DATA_PROJECT"])
    snapshots_list = snapshots_api.enumerate_snapshots(dataset_ids=[dataset_entry.id], limit=1000)
    try:
        source_workspace = ", ".join(dataset_detail.properties["source_workspaces"])
    except (AttributeError, KeyError, TypeError):
        # Properties missing, or no usable "source_workspaces" entry.
        source_workspace = ""
    # Dataset-level columns shared by every row emitted for this dataset.
    dataset_fields = [
        dataset_entry.id,
        dataset_entry.name,
        dataset_detail.ingest_service_account,
        dataset_entry.created_date[0:10],
        dataset_entry.cloud_platform,
        dataset_entry.secure_monitoring_enabled,
        source_workspace,
    ]
    if len(snapshots_list.items) == 0:
        # No snapshots: emit a single row with the nine snapshot columns empty.
        record = [None] * 9 + dataset_fields
        records_list.append(record)
        continue
    snapshot_list_len = len(snapshots_list.items)
    snapshot_count = 0
    for snapshot_entry in snapshots_list.items:
        snapshot_count += 1
        logging.info(f"Processing snapshot {snapshot_count} of {snapshot_list_len} for dataset {dataset_count}")
        # Get public policy information
        creds, project = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)
        public_response = requests.get(
            url=f"https://sam.dsde-prod.broadinstitute.org/api/resources/v2/datasnapshot/{snapshot_entry.id}/policies/reader/public",
            headers={"Authorization": f"Bearer {creds.token}"},
        )
        # Any response body other than the literal "true" is reported as not public.
        public_flag = "Y" if public_response.text == "true" else "N"
        # Get snapshot DUOS ID and Lock status
        api_client = utils.refresh_tdr_api_client()
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        snapshot_detail = snapshots_api.retrieve_snapshot(id=snapshot_entry.id, include=["DUOS"])
        duos_id = ""
        if snapshot_detail.duos_firecloud_group:
            duos_id = snapshot_detail.duos_firecloud_group.duos_id
        lock_name = snapshot_detail.resource_locks.exclusive
        lock_status = bool(lock_name)
        # Get snapshot readers and auth domain
        snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_entry.id)
        # Reset per snapshot so a snapshot without a "reader" policy never
        # inherits the readers collected for the previous snapshot.
        readers = ""
        for role in snapshot_policy_response.policies:
            if role.name == "reader":
                readers = ", ".join(role.members or [])
        ad_groups = ""
        if snapshot_policy_response.auth_domain:
            ad_groups = ", ".join(snapshot_policy_response.auth_domain)
        record = [
            snapshot_entry.id,
            snapshot_entry.name,
            snapshot_entry.created_date[0:10],
            public_flag,
            readers,
            ad_groups,
            duos_id,
            snapshot_entry.data_project,
            lock_status,
        ] + dataset_fields
        records_list.append(record)
df = pd.DataFrame(
    records_list,
    columns=[
        "Snapshot ID", "Snapshot Name", "Snapshot Created Date", "Snapshot Public",
        "Snapshot Readers", "Snapshot Auth Domain", "Snapshot DUOS ID",
        "Snapshot Data Project", "Snapshot Locked",
        "Source Dataset ID", "Source Dataset Name", "Source Dataset SA",
        "Source Dataset Created Date", "Cloud Platform", "Secure Monitoring",
        "Source Workspace",
    ],
)
df_sorted = df.sort_values(["Source Workspace", "Source Dataset Name", "Snapshot Name"], ascending=[True, True, True], ignore_index=True)
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
logging.info(f"End time: {current_datetime_string}")
display(df_sorted)


# TDR Dataset and Snapshot Stats

## Dataset Stats -- Table Count, Row Count, File Size, etc.

In [None]:
#############################################
## Functions
#############################################

# Column layout shared by every stats row, the per-dataset frame and the aggregated frame.
_DATASET_STATS_COLUMNS = [
    "dataset_id", "dataset_name", "orig_workspace", "table_type", "table_name", "column_name", "metric_type",
    "metric", "numerator", "denominator", "result", "status", "message",
]


def _null_safe_int(value):
    """Return value as a plain int, treating NULL-like query results (None/NaN/NA) as 0."""
    return 0 if pd.isna(value) else int(value)


def return_dataset_stats(dataset_id_list, write_out_results, target_bigquery_table, display_results):
    """Profile the tables of one or more TDR datasets and report the resulting metrics.

    For every dataset ID the dataset's schema and BigQuery location are retrieved from TDR,
    then each table is profiled with BigQuery queries:

    * table metrics: column count, fileref column count, record count and, for
      ``anvil_file`` only, the sum of ``file_size`` and the count of supplementary files;
    * column metrics (only for tables with at least one record): count of nulls/empty
      lists and approximate count of distinct values per column, both derived by
      regex-matching the ``TO_JSON_STRING`` rendering of each row.

    Failures are recorded as rows with status ``"Error"`` instead of being raised, so one
    bad dataset or table does not stop the run.

    Args:
        dataset_id_list: Iterable of TDR dataset IDs to profile.
        write_out_results: When truthy, the rows already stored for a dataset in
            ``target_bigquery_table`` are deleted and that dataset's new rows are appended.
            When falsy the target table is not touched at all.
        target_bigquery_table: Fully qualified BigQuery table receiving the rows.
        display_results: When truthy, the aggregated results for all datasets are
            displayed; otherwise a completion message is logged.

    Returns:
        None. Results are written to BigQuery and/or displayed.
    """
    sort_keys = ["dataset_name", "table_type", "table_name", "column_name", "metric_type", "metric"]
    agg_results = []

    # Loop through and process datasets
    for dataset_id in dataset_id_list:
        results = []

        # Reset per-dataset state up front so that a failed retrieval can never fall
        # through to the previous dataset's schema or BigQuery location.
        dataset_name = ""
        source_workspace = ""
        bq_project = None
        bq_dataset = None
        tables = []

        # Grab schema and access information for the dataset
        logging.info(f"Collecting stats for dataset_id {dataset_id}...")
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        try:
            response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            dataset_name = response["name"]
            bq_project = response["access_information"]["big_query"]["project_id"]
            bq_dataset = response["access_information"]["big_query"]["dataset_name"]
            try:
                source_workspace = response["properties"]["source_workspaces"][0]
            except (KeyError, IndexError, TypeError):
                # Properties or source workspaces are missing/empty for this dataset
                source_workspace = ""
            tables = response["schema"]["tables"]
        except Exception as e:
            err_msg = f"Error retrieving dataset details: {str(e)}"
            logging.error(err_msg)
            # Nothing can be profiled without the schema; record one well-formed error row
            tables = []
            results.append([dataset_id, dataset_name, source_workspace, "All", "All", "All", "Dataset Metrics", "Dataset Retrieval", 0, 0, 0, "Error", err_msg])

        # Pull table stats
        logging.info("Running data profiling queries.")
        for table in tables:
            table_name = table["name"]
            column_count = len(table["columns"])
            fileref_count = sum(1 for column in table["columns"] if column["datatype"] == "fileref")
            table_type = "fss" if "anvil_" in table_name else "source"
            is_file_table = table_name == "anvil_file"
            row_prefix = [dataset_id, dataset_name, source_workspace, table_type, table_name]
            record_count = 0
            metrics_error = None

            # Build and execute table metrics query
            if is_file_table:
                query = "SELECT COUNT(*) AS row_count, SUM(file_size) AS total_bytes, SUM(CASE WHEN is_supplementary = True THEN 1 ELSE 0 END) AS supp_file_count FROM `{project}.{dataset}.anvil_file` ".format(project=bq_project, dataset=bq_dataset)
            else:
                query = "SELECT COUNT(*) AS row_count, 0 AS total_bytes, 0 AS supp_file_count FROM `{project}.{dataset}.{table}` ".format(project=bq_project, dataset=bq_dataset, table=table_name)
            try:
                client = bigquery.Client()
                df_results = client.query(query).result().to_dataframe()
                # SUM() over an empty table yields NULL, so normalise every value to an int
                record_count = _null_safe_int(df_results["row_count"].values[0])
                total_bytes = _null_safe_int(df_results["total_bytes"].values[0])
                supp_file_count = _null_safe_int(df_results["supp_file_count"].values[0])
                results.append(row_prefix + ["All", "Table Metrics", "Count of Columns", column_count, 0, 0, "Success", ""])
                results.append(row_prefix + ["All", "Table Metrics", "Count of Fileref Columns", fileref_count, 0, 0, "Success", ""])
                results.append(row_prefix + ["All", "Table Metrics", "Count of Records", record_count, 0, 0, "Success", ""])
                if is_file_table:
                    results.append(row_prefix + ["All", "Table Metrics", "Sum of File Bytes (anvil_file)", total_bytes, 0, 0, "Success", ""])
                    results.append(row_prefix + ["All", "Table Metrics", "Count of Supplementary Files", supp_file_count, 0, 0, "Success", ""])
            except Exception as e:
                metrics_error = f"Error retrieving data from BigQuery: {str(e)}"
                record_count = 0
                results.append(row_prefix + ["All", "Table Metrics", "Count of Columns", 0, 0, 0, "Error", metrics_error])
                results.append(row_prefix + ["All", "Table Metrics", "Count of Fileref Columns", 0, 0, 0, "Error", metrics_error])
                results.append(row_prefix + ["All", "Table Metrics", "Count of Records", 0, 0, 0, "Error", metrics_error])
                if is_file_table:
                    results.append(row_prefix + ["All", "Table Metrics", "Sum of File Bytes (anvil_file)", 0, 0, 0, "Error", metrics_error])
                    results.append(row_prefix + ["All", "Table Metrics", "Count of Supplementary Files", 0, 0, 0, "Error", metrics_error])

            if metrics_error is not None:
                # The record count is unknown, so do not claim the table is empty
                results.append(row_prefix + ["All", "Column Metrics", "Count of nulls or empty lists in column", 0, 0, 0, "Error", metrics_error])
                results.append(row_prefix + ["All", "Column Metrics", "Count of distinct values in column", 0, 0, 0, "Error", metrics_error])
            elif record_count > 0:
                # Build and execute null column count query
                # (raw string: the SQL regex contains backslashes that are not Python escapes)
                null_query = r"""WITH null_counts AS
                        (
                          SELECT column_name, COUNT(1) AS cnt
                          FROM `{project}.{dataset}.{table}`, 
                          UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(`{project}.{dataset}.{table}`), r'"(\w+)":(?:null|\[\])')) column_name
                          GROUP BY column_name
                        ),
                        table_count AS
                        (
                          SELECT COUNT(*) AS cnt FROM `{project}.{dataset}.{table}`
                        )
                        SELECT 'Summary Stats' AS metric_type, src.table_name AS source_table, src.column_name AS source_column, 
                        'Count of nulls or empty lists in column' AS metric,
                        COALESCE(tar.cnt, 0) AS n, 
                        table_count.cnt AS d,
                        ROUND(CASE WHEN table_count.cnt > 0 THEN COALESCE(tar.cnt, 0)/table_count.cnt END, 2) AS r,
                        null AS flag
                        FROM `{project}.{dataset}.INFORMATION_SCHEMA.COLUMNS` src
                          LEFT JOIN null_counts tar ON src.column_name = tar.column_name
                          CROSS JOIN table_count
                        WHERE src.table_name = '{table}'
                        AND src.column_name NOT IN ('datarepo_row_id', 'datarepo_ingest_date')""".format(project = bq_project, dataset=bq_dataset, table=table_name)
                try:
                    df_result = client.query(null_query).result().to_dataframe()
                    for _, row in df_result.iterrows():
                        results.append(row_prefix + [row["source_column"], "Column Metrics", row["metric"], row["n"], row["d"], row["r"], "Success", ""])
                except Exception as e:
                    err_msg = f"Error retrieving data from BigQuery: {str(e)}"
                    results.append(row_prefix + ["All", "Column Metrics", "Count of nulls or empty lists in column", 0, 0, 0, "Error", err_msg])

                # Build and execute distinct column value query
                # NOTE(review): values are split on ',' in the JSON text, so values that themselves
                # contain commas may be mis-paired with column names -- confirm this is acceptable.
                distinct_query = r"""WITH distinct_counts AS
                        (
                          SELECT column_name, APPROX_COUNT_DISTINCT(CASE WHEN column_value NOT IN ('null', '[]') THEN column_value END) AS cnt
                          FROM `{project}.{dataset}.{table}`,
                          UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(`{project}.{dataset}.{table}`), r'"(\w+)":')) AS column_name WITH OFFSET pos1,
                          UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(`{project}.{dataset}.{table}`), r':(.+?),')) AS column_value WITH OFFSET pos2
                          WHERE pos1 = pos2
                          GROUP BY column_name
                        ),
                        table_count AS
                        (
                          SELECT COUNT(*) AS cnt FROM `{project}.{dataset}.{table}`
                        )
                        SELECT 'Summary Stats' AS metric_type, src.table_name AS source_table, src.column_name AS source_column, 
                        'Count of distinct values in column' AS metric,
                        COALESCE(tar.cnt, 0) AS n, 
                        table_count.cnt AS d,
                        ROUND(CASE WHEN table_count.cnt > 0 THEN COALESCE(tar.cnt, 0)/table_count.cnt END, 2) AS r,
                        null AS flag
                        FROM `{project}.{dataset}.INFORMATION_SCHEMA.COLUMNS` src
                          LEFT JOIN distinct_counts tar ON src.column_name = tar.column_name
                          CROSS JOIN table_count
                        WHERE src.table_name = '{table}'
                        AND src.column_name NOT IN ('datarepo_row_id', 'datarepo_ingest_date')""".format(project = bq_project, dataset = bq_dataset, table = table_name)
                try:
                    df_result = client.query(distinct_query).result().to_dataframe()
                    for _, row in df_result.iterrows():
                        results.append(row_prefix + [row["source_column"], "Column Metrics", row["metric"], row["n"], row["d"], row["r"], "Success", ""])
                except Exception as e:
                    err_msg = f"Error retrieving data from BigQuery: {str(e)}"
                    results.append(row_prefix + ["All", "Column Metrics", "Count of distinct values in column", 0, 0, 0, "Error", err_msg])
            else:
                results.append(row_prefix + ["All", "Column Metrics", "Count of nulls or empty lists in column", 0, 0, 0, "Success", "No records in table"])
                results.append(row_prefix + ["All", "Column Metrics", "Count of distinct values in column", 0, 0, 0, "Success", "No records in table"])

        # Convert results for dataset
        results_df = pd.DataFrame(results, columns=_DATASET_STATS_COLUMNS).sort_values(by=sort_keys, ignore_index=True)

        # Write out results, if specified
        if write_out_results:
            client = bigquery.Client()

            # Clear the dataset's previous rows only when they are about to be replaced, and
            # only once profiling has finished, so stored stats are not dropped needlessly.
            logging.info("Preparing target BQ table.")
            delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE dataset_id = '{dataset_id}'"""
            try:
                client.query(delete_query).result()
            except Exception as e:
                # Best effort: keep going so the new rows are still appended
                logging.warning(f"Error deleting records for the dataset from the target BQ table: {str(e)}")

            job_config = bigquery.LoadJobConfig(
                schema=[
                    bigquery.SchemaField("dataset_id", bigquery.enums.SqlTypeNames.STRING),
                    bigquery.SchemaField("dataset_name", bigquery.enums.SqlTypeNames.STRING),
                    bigquery.SchemaField("orig_workspace", bigquery.enums.SqlTypeNames.STRING),
                    bigquery.SchemaField("table_type", bigquery.enums.SqlTypeNames.STRING),
                    bigquery.SchemaField("table_name", bigquery.enums.SqlTypeNames.STRING),
                    bigquery.SchemaField("column_name", bigquery.enums.SqlTypeNames.STRING),
                    bigquery.SchemaField("metric_type", bigquery.enums.SqlTypeNames.STRING),
                    bigquery.SchemaField("metric", bigquery.enums.SqlTypeNames.STRING),
                    bigquery.SchemaField("numerator", bigquery.enums.SqlTypeNames.INTEGER),
                    bigquery.SchemaField("denominator", bigquery.enums.SqlTypeNames.INTEGER),
                    bigquery.SchemaField("result", bigquery.enums.SqlTypeNames.FLOAT64),
                    bigquery.SchemaField("status", bigquery.enums.SqlTypeNames.STRING),
                    bigquery.SchemaField("message", bigquery.enums.SqlTypeNames.STRING),
                ],
                write_disposition="WRITE_APPEND"
            )
            job = client.load_table_from_dataframe(results_df, target_bigquery_table, job_config=job_config)
            job.result()

        # Add dataset results to aggregated results
        agg_results.extend(results)

    # Display results
    if display_results:
        print("\nFinal Results:")
        agg_results_df = pd.DataFrame(agg_results, columns=_DATASET_STATS_COLUMNS).sort_values(by=sort_keys, ignore_index=True)
        display(agg_results_df)
    else:
        logging.info("Processing complete.")
        

#############################################
## Input Parameters
#############################################

# List of datasets to pull stats for
dataset_id_list = [
    'dbf5d87b-4cab-44cd-a792-1d0218aad973',
    '85dbde76-c130-40b2-8a8a-ba815ba499da',
    '1c2fe11d-b020-4c54-8c71-1ea91623d626',
    '84133066-68cd-41fa-819b-d74a3ac85862',
    '54c6fa73-9b84-4a3b-9e97-e4e43165c48b',
    '90ba1853-f845-4502-ba36-b75b9e571bc5',
    'f3c88c3c-8e1b-4af9-9467-0621404e314c',
    '332bb145-6ef1-40ef-932c-aec5bb6210d9',
    '44f83f20-d618-40b5-b2cb-3676b8fe3ad7',
    '3c2c39a9-4cc2-4f7c-89e0-054a871e2c4e',
    '2ebb722f-a3df-4ea4-b72a-813e3db0bab5',
    '1817528a-4f88-4ed6-8965-9eae0220ab27',
    'e03eb011-05f9-4491-b779-0cc2aefabff1',
    'd4bb7169-5a7d-4090-ba62-12ea799c3ade',
    '3f172982-060d-4339-a09b-6994c2c9eb16',
    '51789659-5233-4ee7-8bca-dedebfc87773',
    'a5fe75bb-d28c-42fb-aaf8-92fa37b266d2',
    '6545d602-e5b4-4dd1-8f6a-64e0a1952ddc',
    'e5c79b74-20d5-4b6f-8085-0bc788eed2ea',
    'f492567d-6db8-45c8-b44e-6b5def26c812',
    '12e54f96-00d9-4a38-921e-e0d42610b2c1',
    '845b131e-7c05-4397-ad40-23dea8e9b399',
    'bd492b71-b20e-4056-b8ae-ad8c94cfbc02',
    '9ecc231f-e3d3-4417-a98a-c4db4c638161',
    '3fb2d04a-d18b-4bdc-9372-99b992f2ae42',
    'e922a496-e686-4fa1-911d-2159ceb0f09f',
    '8fbfea50-6a71-4b19-98e9-f95e3a8594c7',
    'd911e57a-ebb8-4be8-876b-d8e5790ddce3',
    '6c9423a2-3ea7-4c3c-9b12-0cc993bc095f',
    '52e015b5-22b7-4a96-9f0a-ea3afccbfcbc',
    '325f3ee8-2adb-4092-bd78-1b5ea5b0d1d6',
    'ae50ef98-ef3d-4427-b094-83b2d90787a0',
    'e6771964-50e9-482f-9d23-18c22cd89ab8',
    '3fbacc64-4c53-4770-8cdf-a616c10ec5c7',
    'a3ae33bb-8b3a-47e5-a2d1-a49c954776b3',
    '0b0a52bb-a1a2-4638-9259-4447761c2da4',
    '0eb42259-7b44-450f-a9d7-500b2ea7179c',
    'e16adabb-88e0-4739-983a-98ac5c181842',
    '71f94dff-fbe8-4881-af1f-4987b67d5181',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '0e65b131-fd14-4fce-908b-c5b89a71a9c1',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
    'd00353de-f6f9-42d9-8a8f-f88b3d880dbf',
    '3be57453-9325-4c2e-b73a-832139b61778',
    'fcc60ac9-0d20-4a7c-97e4-e3c8d3aa8f76',
    'dc5f85d8-333b-4b68-b160-ad9856233887',
    '655e6a61-5400-4d8a-95bc-1506e026b289',
    '1f2d14d4-1bd8-46fc-9d35-1a415e5f326a',
    '64fd39fc-b32e-4b0a-8f83-4bf11b197462',
    'a77a2c65-38fe-4bf7-9ea6-0a2dc65eb21f',
    '25248cd8-2e98-4a83-9ccf-af7214fa71d6',
    '158ebecd-4596-4541-b832-a137232b7036',
    '1ccb95c3-1901-428e-b7bb-34495f41f4d2',
    '02ff1051-cd1d-4bbb-a005-21384cbff846',
    '0144b0d3-a809-46df-8c67-7ce42bdd579a',
    '35a1009d-93a2-49b1-a801-fe84d6b7a2f5',
    '50132478-c9fb-4dc5-86cd-d5dfab909393',
    '35064fc1-6c52-4005-8e99-cb0d6afd3f8c',
    '5cf859f6-990c-4b04-8609-35d5c57920f0',
    '62cfdce6-2d4d-415c-a11e-5ab60131c668',
    '2d07dd45-a263-440d-a339-9ccbab93aba8',
    'f1513955-0264-4733-bd25-3f752c61a323',
    '93e712f2-3e54-466e-aa53-57eb69c43bc0',
    '296f653a-91a8-4139-9bab-e6ae13afe99c',
    '633dc1aa-084d-43bd-9b17-bc6e57f81d48',
    '9320b3b5-3944-4bd2-913a-23b72bccd86c',
    '86ab4d3b-86ce-422b-ae6f-1ec6968a874d',
    '4124010f-7308-4831-80d7-ea14343249ab',
    '3037caeb-fa7a-4924-b399-7e4c7173b3b8',
    '146b72bc-1dcc-4e3b-bcda-d3dd25418012',
    '1d575e14-c3b1-4ead-a63b-a21c08c6a14d',
    'c5b1e333-7203-41ce-b8f7-3ef3a3bd721f',
    'bf519ea2-afe1-486a-9954-7362f10b6b60',
    '254ffffc-2bd6-4b2e-905b-a8c54c348cd0',
    'bb65d291-a673-4e4d-8a37-ab1f7401a902',
    '3a9604d7-456a-453d-a46b-40408624a07e',
    '2cbe079d-e7ab-47d8-836e-454a71440297',
    '84fad495-2756-472f-ad20-f91de6f67baf',
    '28e73469-12d4-493b-bf6f-83359c1f69c5',
    'db266afc-2f75-4b03-a3b8-c69e0ce6f713',
    'dd6866e4-8949-45bd-8910-8ce64f79e3c7',
    '12bbfa4c-c30a-4cf6-b79f-45354f842964',
    '84ac0d05-4be5-43e9-973e-ef999144d802',
    'ff8b1212-858a-4048-8f63-9464c922591a',
    'c814d754-cdc5-4b0d-8671-a39e85b2c473',
    '797b2563-5d56-4f5c-bdaf-3bfd11e8f5b3',
    '85287d84-fefe-40df-ad40-5b135ee0c07f',
    '7eeede5a-c86f-4577-9f3c-65ab618a6dee',
    'a52c04ee-cfef-46bb-9b40-6a9b292e1a7b',
    '0194eea9-d779-4957-8521-11717a378e66',
    '0d82658c-44b3-4cea-a388-3353a96a31ef',
    '77dca0d5-4d22-4415-8858-075590d25cb5',
    '1b05159b-6277-4345-9d59-f7bba5ea1d56',
    '92299ff4-c0d0-4e94-b374-75d0038cbd68',
    '60f96582-79ad-4461-9f9a-53c1bc3d17b6',
    'cc107de7-d623-464a-a875-c8b7ae5fb09d',
    'b5d7c34a-c383-4fc7-aa4d-b6dc941cd41a',
    '85baa8f8-619c-4165-9d3e-53220f645814',
    '3f278de3-f201-4344-9639-d35cd7a62adb',
    'c423b18d-12f2-43e4-97f9-993e2943270e',
    '416b8daa-9537-46db-ae7b-3f5ff5f01dc3',
    '61940344-e6c1-484e-ba10-131f43a9b13a',
    'ceee2791-0fdf-45fc-a4e8-8077916771aa',
    '5205f817-7de0-48b4-89fc-6398cf13bff7',
    '1d2f5472-ab6a-4a9b-ba53-520858cf79db',
    'f757278a-3c74-4690-bf89-5149d21ff3af',
    '5a103ab3-29c3-4d07-a0f6-4999c256cf26',
    '2a263db0-8c33-4171-840f-54bf4755a4b9',
    '9828f3fe-f676-4bf1-b600-5effa24ea9c8',
    '28849dc9-a97f-469b-b2ac-a8ff97693f02',
    '29cd0578-fb47-495a-8f48-b37325eed81a',
    'ba503d2e-48af-48bb-910a-be41790d921c',
    '472f01ad-7bc3-4fe5-9771-2695930dbc95',
    '956cd931-7077-4a08-9c75-ab8b4e5d1eb8',
    '31a42df0-29f6-4d4e-ae5c-2e13abc355f2',
    '13364604-ed08-4a61-89cd-65eb372ac8c3',
    'b724164c-712c-4615-97b7-529a108a753a',
    '22199347-9454-41e5-8912-eb38edd33a25',
    '60cadee3-9e63-4897-ac81-4fb283033648',
    'dcd4112f-09d3-43ed-8441-df9bf4c9ddc8',
    'da02c3f1-371b-4afe-9b5e-b8c584fd5907',
    'f6565f2f-4478-45ad-8c11-04dd242fc6a9',
    '275ea204-4612-4d3c-ac0d-f110f61d62ad',
    '72f73fc5-6a3a-43a0-8cce-09f4726b736c',
    'be72f1e3-b5f5-43f4-80db-6d7de93a654e',
    'b6bf4699-6f61-4c6a-9d42-ad055a0de008',
    '8abf299c-cd4e-4ce0-b5cf-4f9abe8cc891',
    '0b6eb077-2eca-4fe6-b012-26fab725b907',
    '5c659e81-e687-4710-a4fd-000ca593155d',
    '42965913-4223-484a-9b3d-abc0002d277d',
    '72e639d1-b8c5-45fd-9acd-a8e5e2b7fa0d',
    'd6823ccd-7247-4efc-8841-f53f456351ed',
    '1048a860-d5ff-4f61-95e5-851e1266d4c1',
    '8681cdcf-b775-4b56-aace-3f3e448261ef',
    'f0db3b27-c952-477d-bc33-9b96a250e168',
    '9e1a6a7d-b45e-4fd2-a1ff-df131da4c713',
    'd596ee91-481c-4eb5-9a8a-88c1e10ba9b6',
    '24470eb6-97c2-4cd4-b484-87a7d634c5b3',
    '3a781e70-cf6e-41c2-8d68-2326f16986e7',
    '048afc84-cdd2-4b39-8ea5-7351f4699761',
    '3fdcdafd-5328-418b-85f8-47b0006de468',
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    'ec6f49a2-176c-4564-82c5-e751baab46aa',
    '68ea655f-b4a3-43e2-95e4-f158ca2d67dd',
    '0447c960-bbfe-4e42-a95b-dd3d1d9a368e',
    'cc19d19e-6f7e-41b8-87a6-77f41d53e650',
    '16031a34-f1ba-4bde-af43-1822f1516944',
    '483d3454-54da-4243-bbeb-98cbf1d088d0',
    '6765ce2d-ebc8-4367-8855-c0f8e62cb355',
    '1b4a324a-5621-4399-85d2-f91aa03418b5',
    '09642596-d33a-4261-8bf7-eb1dbb37d572',
    '75119ed5-b8aa-4f45-bdef-e3c673bbe44c',
    '04a874df-c57b-40fc-9139-bc3a05129115',
    '2b8ad26a-e66e-4b03-a65a-5b504cecacfd',
    'a7226f10-bdba-4284-97b3-0738a5912770',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    '8da05494-fe7a-4af5-b257-bada143ee426',
    '8b098ab4-df02-4619-8ded-657e496695c1',
    'd48adc59-8934-41bb-9720-63e71f1933be',
    '8e88cabc-e713-44ed-a5d2-41935c3b4eb5',
    'be8cfc23-cd19-46fb-92e1-a77ac380d7aa',
    'f9224ea2-dd31-421d-80d4-f35082ef8d68',
    'e2a398ff-18c3-4258-9d75-89adb2923e88',
    'cb8ebcd0-bb5e-4a6d-bfef-5c651a1a9f6e',
    '6238f8f7-5efb-4023-8d85-ef7db9b4dad7',
    '32c09444-3d4a-44d5-af6b-07eef92189db',
    'd7686f98-05a4-45c9-af2e-3ebc524a5b2d',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '8ccefc59-38a5-476f-b7d3-3f98315a97f0',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '6e67e1e1-5c39-43da-960f-48385789c4e1',
    '92382848-f5e9-426c-b7dc-f2841ae97018',
    '4999a410-990e-484b-b4f3-d636f894a741',
    '1f534eb4-701f-4182-9895-64c5e5b52d82',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    'feca4815-b44b-4b2b-8d77-75edd62ba5a6',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '7427b2eb-a84f-413c-bfb0-7d2e36b0628f',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '28c3df75-0b08-4d5c-9feb-6e2e918572ea',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    'f69c21e9-cb5f-4e72-acfe-c54b672a9f3b',
    '2ef4530a-cc36-4f32-9a1a-63a555346587',
    'e917c83d-c482-442d-81ce-869de7d20903',
    '10774229-1487-4188-b2c3-1fabcf85492a',
    '65793118-3c88-4185-9172-2354850e6056',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    '46536136-08e4-4521-8e6c-67f023de020d',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    'b8e7fe18-9c3d-4cc0-bbc7-85b27197fc8f',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    'b60b4737-c646-4299-85a0-520890e830b7',
    '280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
    'd48db47e-acba-4377-b031-f6dfc21f3658',
    '3fd2204c-8654-4af7-832f-c186447262e0',
    'ae34e63e-13af-48b8-8b72-8137289091b3',
    'd3ed2595-b8be-40c8-b7b6-10a4997b9d2e',
    '575dc7da-58ed-407d-9e88-7b586f28bf65',
    '20ddfcd5-d456-431b-9f05-781e05d873d6',
    '15d41c35-943c-474b-afa6-e1c6d6e4be2b',
    '61803dc8-f649-43e5-ab15-d351f2cef629',
    'abe58d43-e1c7-4953-aa41-4d3b6f6cca44',
    '3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
    'fa278604-7d85-4491-a30d-15c7821f8b00',
    'dd6c6688-b73a-464c-86d9-3369fdf98268',
    'b5c0bf91-9d20-41a2-9dd2-87d0ef0310f9',
    '97c636f9-0983-481f-8ff9-7b5b3ee6b10e',
    '15b153f5-ed02-4216-8f96-99743b8b4fc3',
    '747858c0-d139-4f52-9f0e-a618b880d6d6',
    '32bc49c6-7583-4613-a72f-5edb12b2a808',
    '3eb8ea77-4605-4bb7-90f9-671953abe4a2',
    '2b08cb76-061d-44c6-a00f-b43a5421df5e',
    'ab7e390a-adc5-4f9e-b317-a216a2904c93',
    'd1e6d0e4-d49e-4a16-93c6-7956b2c03414',
    '713f8676-8034-4827-bccc-cd6d95b1a4c4',
    'b00883d8-9251-435d-aefc-8a703d96d2fb',
    'eefbea02-0d65-441e-b455-35aa21d25ba3',
    'ff7e3be2-c0ac-4d97-85da-6229bf7585ac',
    '7ac92a42-e112-49c3-a8f5-8ad2c7ef5578',
    '0701aae2-8661-4eec-84e0-7c8be1c89a18',
    '7efb1905-34b4-4f1c-a8a6-8e64b3640a68',
    '11a2b088-8c1c-47d2-9c1e-455d457d2f05',
    '74608bd9-39e4-4f48-9b7c-1cd9d3c599c9',
    '7baf8e8c-de11-452d-b2e1-aad7c08cc18e',
    '23b0219d-0820-4017-b942-bda8578e90e2',
    'd7bcfc5d-e258-4bd6-a413-bb7a118e6bff',
    'a5f631ea-2b4b-43f2-9ea0-e31f2b11fa27',
    '8523489a-f57c-4993-81e4-1ed86a5c092d',
    '395da421-e6e8-4a26-ac93-eb7050a7cb1f',
    '9cb5ce25-38e8-4628-9ddf-d6aedf5efe0f',
    '615f6246-1c39-4e44-a9d4-c7133a2ae62d',
    'a647528d-925e-4c02-8825-ff54720c6ee4',
    '2c6f63b2-439e-499f-b687-b3fdd88a492e',
    '68a916af-2e0c-41bd-8535-c7eacbc2d1b7',
    '0e7f31a0-c712-4ebf-ab3a-64c37f43e52a',
    'c8b1d323-f352-482e-bf17-82075c23dcee',
    'd30f51c7-d642-4e7d-a168-967b9520a80a',
    '8d89608c-0d61-4d71-a2e3-9fbc6cda69bf',
    '71219f56-551f-4ad4-9a38-cc4aaf8a1e9a',
    '48dd6010-77dc-465b-a27c-695e29b57a5e',
    '3376a8b6-7ef6-4191-97ab-a547da0d330d',
    '21384132-1697-4e9b-b863-a6492d13285d',
    '582f5f8d-b96f-490e-b417-ba824baeb06c',
    '7e825ee6-7c03-43cc-b0a4-0d9203a30bd9',
    '2843292e-e494-4642-90e0-57e5c153f12c',
    '4ecbb7c8-0246-47f8-9654-4caca1d52565',
    '7e3ea1bd-95ba-4cad-90c8-3eec95be9cc8',
    'bbba696b-d023-4bb1-a213-c8bee31e8bae',
    '00bd45f9-beb2-4fb0-8680-bd30e392975a',
    'f85e467a-958f-4da5-a01b-8df883e69122',
    '470eee0f-2053-4d9b-9f5e-ca9661a6cc16',
    '5c1dc76d-b703-445c-9b38-cc2d00b9ab16',
    '0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
    '608d793e-a78b-4872-a50c-21a9eaa60ec3',
    '74ede771-6781-4980-bfb9-5d853b7cdd6f',
    '6c47e282-5d5e-445c-b6bd-c0024946fbe0',
    '6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
    'ff8ffbcf-c932-48c7-8d5e-d995d5680e21',
    '9d74b4f0-b2d4-46aa-867a-52fb6102bfdf',
    'e34f15f7-c225-4314-a638-90504bb0aa0d',
    'c1d222ab-bc0e-4e13-8379-0ee5be9e140e',
    '582187a5-ad63-4759-9162-55fa6337eb07',
    'd1e8d19a-970d-4ede-b5bc-9cab7237adec',
    'af867604-d801-41cc-9949-017eb30a0cbf',
    'c9dd3578-01db-4687-9807-4f71368941d1',
    '722e332c-fb1a-45fe-80c7-cc670f025b7f',
    '9f152896-ebf1-4756-b678-bdf739a92256',
    '478aa270-fbd4-4a45-8f63-221b4066168e',
    'e9c7ad29-2213-4648-9164-33a07bd42cdb',
    '1d140c76-a06b-42a0-bae8-b9e169ebe394',
    '5edcc3db-c676-412a-9506-600959bb81f2',
    '9f7dbe05-96b5-4b2f-9f3a-34b552e3dd21',
    'ccc524ab-d9ad-467c-a25b-9a14fb05e976',
    '9e3fb02d-dcf6-486f-a42d-89446a852057',
    '4b341ba9-49a5-43a2-9b7e-cc96beb59946',
    '15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
    '3a3100bb-369e-47c1-a77c-2cacb7cf020d',
    '2c11b505-17c8-402e-8422-0239accb449d',
    'a6d7e030-e6c8-4c62-8cb5-165ef54987c4',
    'e25a8172-1e34-442c-a45d-583027a2d734',
    '0c18589c-6432-4a6c-90ce-985a47a66f39',
    'c911503c-f010-4c17-ac57-1d82e954bdc7',
    '487016d8-ea02-4b20-a45f-7382139aa865',
    '677f0bdf-6c5c-462b-8294-3666f777bbc5',
    '34da5c11-bbe8-4e55-8d89-9ef8a1c66200',
    '9a4d9d5f-72aa-4d7d-90f1-6d1181ee984c',
    'e6b15b39-daba-431f-a918-e4e43e702c30',
    'bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8',
    '128dce74-fa37-4f2f-8a80-d542edd81a11',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'f461fca1-80b2-4980-83a8-e165d49acc18',
    '37f0f1f9-83fb-49a1-9941-093c068c32d0',
    'cfb3dad7-c6d9-47c0-81b0-2133d75f5c0d',
    'c5c0893f-b254-4038-8d08-b28ef5a26b5d',
    'bbcf8529-1a04-43fc-b6cf-cb161028159d',
    '06421648-dfcb-4460-b93b-c7d6804dddbb',
    'e0b28b59-1cb5-44f4-ab8f-badf5c74f69f',
    '631deea0-2821-4d14-ad02-dc0ce4864924',
    '95788aa7-c897-4ae8-9166-4b8fc1fc5342',
    'eede320a-ed63-41d8-960d-5405a26a194f',
    '36dccf81-6932-43ae-9864-53379832d878',
    '9102024d-58c0-4bb9-aa55-12c00d98b6cd',
    '01eaf423-8cab-491a-b82e-6915dbc73594',
    '0481a135-9db1-424f-9065-a83ebd7ec995',
    'b60876c5-d825-4303-befb-ffff55b92aba',
    '49022563-1be1-4e42-a11c-01743cd5c94d',
    '64f2dbe1-6f58-493d-ab6b-c93568d828f4',
    'a9626803-72c2-4e23-968c-a090e3f22c5e',
    '095728d6-4ea1-4909-8a74-a8f3fa7f86cb',
    '8309cd89-a912-462a-90ad-f13ae0d7aa6c',
    '902596ce-714e-49b3-8271-f3dfece52309',
    'f5f29e4b-68f7-443c-b290-0827d4167fd5',
    '07c3a7f4-1e59-4dcb-a244-2fd3d084e2b0',
    '6f49717f-8f57-42d0-8548-316ecc292415',
    '7e693091-8ae4-4c40-8e66-c3b39f01b90e',
    '544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
    '5137255f-0c58-4ac7-9266-bda8ab0247c2',
    '5243df74-712d-49a8-989b-528d15088e8f',
    '278a26cb-a710-4fff-928e-fc2e7084a75a',
    '58a1d168-8290-4c69-bf01-17ba3a084365',
    '9fc492f3-8d13-47ae-93e9-812c0224f1aa',
    '822d381e-cea0-45bb-8fa0-1b7194b4b64b',
    '2dc01a50-ea7b-4d9b-be57-1ffbdd98b27b',
    '5b6676dc-f46e-43a8-b87c-e431e369e53c',
    'eb35085f-0cbf-4829-a3ad-acaa53a250b5',
    '36fa2d20-622d-4cca-80b0-683672c94170',
    '44161b51-953d-4f6b-9448-5cba4a44a9d8',
    '0b90b2ea-8ca3-406a-9f69-95eddf7699ef',
    'bfb202bc-4078-4df0-82b9-9218dbc1f1a1',
    '8cc59f51-b0df-4a5d-a3c5-83ee526ff1af',
    'fda7c4b9-9f35-482b-9eff-be7f11058d94',
    '85646f4a-e424-4363-8033-1e7522e8f175',
    '8945794e-174a-49f9-a2d4-4242f9bf3833',
    '529343b4-698a-4b36-ac55-db8a6965ad3f',
    '0f949ee9-0986-42b2-af5d-0f4c8338c664',
    'cb1f06fa-b916-477d-8ab6-fb4b3f24efd3',
    '05253b3c-e8a3-4db4-8a6d-014eac7b3d94',
    '4807db90-b0f7-441d-b489-932f9b341f74',
    'c33b1f32-6021-4d1c-a4d5-fc3d501107f4',
    'aa314675-af62-41df-b5cb-3b22558e903b',
    '20741062-7d1d-44b7-bc33-39c9ad26e414',
    '69ce1be3-1815-43a4-bdd2-4696d9c8d09a',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '18716daf-4223-44a9-bba9-fc9baeef7d07',
    '475430c5-28cb-456d-9c5c-bdbfab9fafb2',
    'f0061cb3-688e-4ad4-aeb8-8614282292ec',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '3a89c170-2939-4c12-9940-f32d96fa9e55',
    '9a06c401-da3f-41b4-b38b-238796fcae09',
    'b32d88c8-31e3-4789-a75f-e52bf1272937',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    '5e0e8f9a-ce97-4b18-9540-3015c61e393c',
    '1c8ba244-1c7f-433a-825b-d2d34d018dcf',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '373ff2e8-0f63-4179-a55c-3fe0b85556aa',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '7577f264-8e84-440d-9346-7c4d5affda51',
    'febd8561-4769-4f3b-b7c0-ae7ff6ede2e9',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    '2355554e-8951-4b41-bcd8-32e18cddb7c9',
    'de1e7762-673f-4d44-8f45-7e693bb338b7',
    '239a484f-67c2-4ba3-a3d0-d6e4c2b27475',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    'c6f3bd64-ea67-488f-904f-f0bdf6320b5c',
    '166746e8-ce26-4fa1-a587-443ca9fc59a1',
    '80baf71d-28d0-4bca-81b7-49ddfadfa7a3',
    '6d18aafc-0240-499c-902e-a72a5b98ff0a',
    '263ab7c9-bd69-45dd-abb7-bbf35b9786ed',
    'e0c7877e-75d7-47d8-b5e9-5dd677d03353',
    '49a97523-0a7a-4d5a-ae20-496f86de2032',
    '583023a1-aa12-40e2-a964-8ad50ad400ba',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '17d3ffb4-e891-4ac6-a91a-fd52971c1115',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '488a38ee-f996-482d-a562-a4474f5594de',
    '680d748c-7c60-46e2-aea5-7fc557a916ea',
    '462d992a-7c13-45ac-a6da-1254fc3a9031',
    'dbb4df81-9115-45d1-b51d-875e0669edc4',
    '6fd0f009-3c34-4529-9a38-c59745545490',
    '74d1e549-5ae8-4410-9428-f8f2cc85fa80',
    '868f72af-99e8-406e-9f7e-14577e6c7157',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    '595b6755-e7ae-4e83-af2e-693c089aeec3',
    '4d01e12e-503e-4447-8e49-8c2b77ffb00d',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    '279e5670-8a47-4992-bb10-14e6c719db97',
    'd306000b-88c1-4220-8d7e-933c0118a983',
    '703c4bc2-81bf-435a-87fa-21dc9278bad6',
    'ab76b5ca-e464-4063-b949-853f61036370',
    '672b617f-936e-440a-a735-80f94798aed1',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
    'c9986260-0c1b-4fd3-8132-6fa7353046e6',
    'bda2bec8-a142-47ab-bfb4-83759ac2bddd',
    'e858d4f9-3385-4640-b0cb-4894e86d501c',
    '39fe0c8b-bd78-4565-9415-63eabc1d6d85',
    'da4e904f-0346-4cd3-a5c2-ba932511d98d',
    'c46c2220-da88-4f60-a0cf-eebfd0a8ff12',
    '629e31cb-dd7b-4345-abf2-fa23c6c65a09',
    'e9a57082-5a93-481a-bbd0-1acb03ac751a',
    '0faf149d-b316-4fbd-8605-a59354f0eacd',
    '1d23d3cc-5db6-4734-bfaa-507dd366d99b',
    'dd58f556-0049-49c3-9a51-d6470a2abddc',
    '53185d06-f2cc-4942-88c4-8534b559a9ff',
    '0132f320-830d-40d0-a4da-06a5d5f9e8d9',
    'bb7d6408-941a-4da6-8613-36498bc6d91b',
    'b8d11ca1-3db8-4efa-bf57-0305e004a26d',
    '9bd56ad6-080e-4d26-acca-83e4df8aa913',
    '00c11c7e-8530-4bfc-abd7-8c10f4c602d3',
    '8f6b9e20-9468-4f46-aa45-eeab9de88e53',
    '272dff18-acf3-4874-a55f-ba8fb6f80352',
    '70ac3659-06bb-4022-be55-af81d3e35b6f',
    '409b92cf-5c4d-4997-9736-ef2ea10d19e9',
    'a8636719-e26c-49b6-9a53-7d77f3d3c94b',
    'a3e81d5f-8dd6-43dd-9172-d80d212efa2d',
    'd40af129-c13f-45b2-92f0-d0e8fa5cc1c9',
    'ecd2d2f9-2b6f-4743-8d04-c9bb554a96cb',
    '9ee78822-7acd-4fab-9999-c58e9fe266ad',
]

# Output options: whether to write each dataset's stats to BigQuery, the BigQuery table
# that receives them, and whether to display the aggregated results in the notebook
write_out_results = True
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_inventory.table_and_column_stats"
display_results = False

#############################################
## Execution
#############################################

return_dataset_stats(
    dataset_id_list=dataset_id_list,
    write_out_results=write_out_results,
    target_bigquery_table=target_bigquery_table,
    display_results=display_results,
)


## Snapshot Row Count Collection 

In [None]:
def return_row_counts(snapshot_id, results_list):
    """Append the combined row count of every table in a TDR snapshot to ``results_list``.

    The snapshot's table names and BigQuery location are retrieved from TDR, then a
    single BigQuery query counts the rows of all tables together.

    Args:
        snapshot_id: UUID of the snapshot to inspect.
        results_list: List that receives one ``[snapshot_id, row_count]`` entry.
            It is mutated in place.

    Returns:
        The same ``results_list`` object. The appended ``row_count`` is 0 when the
        snapshot cannot be retrieved, has no tables, or the BigQuery query fails
        (failures are logged as warnings).
    """
    # Grab table names and BigQuery access information from the snapshot
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    try:
        response = snapshots_api.retrieve_snapshot(id=snapshot_id, include=["TABLES", "ACCESS_INFORMATION"]).to_dict()
        # A set drops repeated table names so no table is counted twice
        table_names = {table_entry["name"] for table_entry in response["tables"]}
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        # `Exception` rather than a bare except, so a kernel interrupt is not swallowed
        logging.warning(f"Unable to retrieve snapshot {snapshot_id}: {e}")
        results_list.append([snapshot_id, 0])
        return results_list

    # Nothing to count: skip the query, which would otherwise be invalid SQL ("FROM ()")
    if not table_names:
        results_list.append([snapshot_id, 0])
        return results_list

    # Build row count query: one SELECT per table, stitched together with UNION ALL
    row_count_subquery = " UNION ALL ".join(
        f"SELECT datarepo_row_id FROM `{bq_project}.{bq_dataset}.{table_name}`"
        for table_name in sorted(table_names)
    )
    row_count_query = "SELECT COUNT(*) AS row_count FROM ({subquery})".format(subquery=row_count_subquery)

    # Execute query and record the result
    try:
        client = bigquery.Client()
        df_results = client.query(row_count_query).result().to_dataframe()
        row_count = df_results["row_count"].values[0]
        results_list.append([snapshot_id, row_count])
    except Exception as e:
        logging.warning(f"Row count query failed for snapshot {snapshot_id}: {e}")
        results_list.append([snapshot_id, 0])
    return results_list
    
# Snapshots whose total row counts should be collected
snapshot_id_list = [
    'bb7eaad8-b02c-455c-964d-c9242019d9e5',
]

# Each call appends one [snapshot_id, row_count] entry and hands the list back
results_list = []
for snapshot_id in snapshot_id_list:
    results_list = return_row_counts(snapshot_id, results_list)

# Tabulate the collected counts and render them
results_df = pd.DataFrame(data=results_list, columns=["snapshot_id", "row_count"])
display(results_df)


# Pulling Dataset Sizes Across AnVIL

## Pulling file counts and sizes from TDR (Total)

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# A single BigQuery client is shared by every per-dataset query below
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
# (selected by matching the dataset's default billing profile)
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details and pull source workspace(s)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except Exception:
        # Properties missing or not shaped as expected: report no source workspace
        source_workspace = ""

    # Pull file count and total file size from the dataset's file_inventory table in BigQuery
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    file_size_query = """SELECT COUNT(*) AS file_count, COALESCE(SUM(size_in_bytes),0) AS file_size FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
    try:
        df_output = client.query(file_size_query).result().to_dataframe()
        file_count = df_output["file_count"].values[0]
        byte_size = df_output["file_size"].values[0]
        status = "Success"
    except Exception:
        # `Exception` rather than a bare except, so interrupting the kernel stops the
        # loop instead of being recorded as one more "Error" row
        file_count = 0
        byte_size = 0
        status = "Error"

    # Build record for dataset
    record = [dataset_entry.id, dataset_entry.name, source_workspace, file_count, byte_size, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Source Workspaces", "File Count", "File Size (Bytes)", "Retrieval Status"])

# Sort while the size column is still numeric: sorting the string form would order
# lexicographically (e.g. "9" above "10000"). The column is cast to str afterwards,
# as before, so the displayed values are unchanged.
df["File Size (Bytes)"] = df["File Size (Bytes)"].astype(int)
df_sorted = df.sort_values(["Source Workspaces", "File Size (Bytes)"], ascending=[True, False], ignore_index=True)
df_sorted["File Size (Bytes)"] = df_sorted["File Size (Bytes)"].astype(str)
df["File Size (Bytes)"] = df["File Size (Bytes)"].astype(str)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

## Pulling file counts and sizes from TDR (By Source Bucket)

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# A single BigQuery client is shared by every per-dataset query below
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
# (selected by matching the dataset's default billing profile)
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details and pull source workspace(s)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except Exception:
        # Properties missing or not shaped as expected: report no source workspace
        source_workspace = ""

    # Pull file count and size per source bucket (the "fc-..." token in each file URI)
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    file_size_query = """SELECT REGEXP_SUBSTR(uri, "fc-[0-9a-z-]*", 1, 1) AS bucket, COUNT(*) AS file_count, COALESCE(SUM(size_in_bytes),0) AS file_size FROM `{project}.{schema}.file_inventory` GROUP BY bucket""".format(project = bq_project, schema = bq_schema)
    try:
        df_output = client.query(file_size_query).result().to_dataframe()
        # One record per bucket row returned; a dataset with no rows yields no record
        for source_bucket, file_count, byte_size in zip(df_output["bucket"].values, df_output["file_count"].values, df_output["file_size"].values):
            status = "Success"
            record = [dataset_entry.id, dataset_entry.name, source_workspace, source_bucket, file_count, byte_size, status]
            records_list.append(record)
    except Exception:
        # `Exception` rather than a bare except, so interrupting the kernel stops the
        # loop instead of being recorded as one more "Error" row
        source_bucket = ""
        file_count = 0
        byte_size = 0
        status = "Error"
        record = [dataset_entry.id, dataset_entry.name, source_workspace, source_bucket, file_count, byte_size, status]
        records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Source Workspaces", "Source Bucket", "File Count", "File Size (Bytes)", "Retrieval Status"])

# Sort while the size column is still numeric: sorting the string form would order
# lexicographically (e.g. "9" above "10000"). The column is cast to str afterwards,
# as before, so the displayed values are unchanged.
df["File Size (Bytes)"] = df["File Size (Bytes)"].astype(int)
df_sorted = df.sort_values(["Source Workspaces", "File Size (Bytes)"], ascending=[True, False], ignore_index=True)
df_sorted["File Size (Bytes)"] = df_sorted["File Size (Bytes)"].astype(str)
df["File Size (Bytes)"] = df["File Size (Bytes)"].astype(str)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

## Pulling file counts and sizes from WS Buckets

In [None]:
# List of buckets:
bucket_list = [
'fc-secure-0075565e-7b76-4eaa-86e4-84a16acd7aba',
'fc-secure-cb321316-7166-4147-979c-5adf12904f30',
'fc-secure-8db2f74a-d4c8-47b9-8deb-bfadbf40ed18',
'fc-secure-276f37f8-c140-4502-9466-30a9be4e0e25',
'fc-secure-d408d9eb-a259-4598-b49d-fc3efafd13de',
'fc-secure-7e69c896-d6c0-4a4e-8490-42cb2d4fdebf',
'fc-secure-9e3357c0-389c-41d7-94ee-56673db6b75f',
'fc-secure-22d0b958-89a2-40be-91a5-efd0a24ccca6',
'fc-secure-320b42bf-eb47-4629-97e7-0ebaf188a091',
'fc-secure-fb033efd-49fe-4487-8b85-79c7b1c28384',
'fc-secure-45ba0648-12a1-4196-be6e-2a15ca834ca6',
'fc-secure-29c3d060-ca72-4de0-b87a-d45aa093ae1d',
'fc-secure-5b25667a-625b-4b0c-8ca2-b488dfce53c3',
'fc-secure-a9e585f8-4539-4d20-accf-d10790dd09d7',
'fc-secure-8ce36ffb-ad87-4942-abdc-2c0c6ce28483',
'fc-secure-356259df-0d87-4ad9-9cfd-0ef7947aeafc',
'fc-secure-87dd2b67-d7fc-49cb-8da9-eafa341cc1fb',
'fc-secure-be182c9d-e20a-43aa-b158-39113ea47705',
'fc-secure-3cbd4d3d-7331-46f9-a98f-ebba0a894562',
'fc-secure-905ccfc2-3a4d-4de7-8fe0-3ff6e1bc27ac',
'fc-secure-d0b94591-646e-4112-9640-9f8b688a222a',
'fc-secure-7cd273e4-2240-474d-aa8b-d02807b380e7',
'fc-secure-7171c5b1-2c83-4dfc-878a-f427ed7397f3',
'fc-secure-e58ec1b0-051d-4577-a85d-7c55ae2c0c51',
'fc-secure-2662b65b-4fec-48d0-bad8-e59e0349e581',
'fc-secure-3a248261-5349-4669-aa8e-9494ccb44c60',
'fc-secure-180323ab-f749-4063-ae83-3bb93c739046',
'fc-secure-bbac96b8-17df-4f33-9e42-5c9b6784e333',
'fc-secure-84b45515-60e8-4e08-9d0b-a960a153f66e',
'fc-secure-55d18a32-ae61-41e5-897a-846a95d97758',
'fc-secure-34427938-7ee7-44a8-9258-5b979d5a0c98',
'fc-db8d28e8-e27d-4c0c-8559-2ac15d4f82c9',
'fc-9ee5368b-50df-44ab-86ab-20d34db6bbcb',
'fc-97e826b0-7f75-4c91-9a42-955967e87a1a',
'fc-64b1886e-5c5b-4f3e-8518-6c4f0cff22b1',
]

# Loop through buckets and record size and file count
print(f"Start time: {datetime.datetime.now()}")
result_columns = ["Bucket", "Size in Bytes", "Object Count", "Run Status", "Message"]
results = []
for bucket in bucket_list:
    start = time.time()
    file_count = 0
    size = 0
    try:
        storage_client = storage.Client()
        # Requests are billed to the "anvil-datastorage" project via user_project
        storage_bucket = storage_client.bucket(bucket, user_project="anvil-datastorage")
        # Stream the listing instead of materialising every blob object in memory;
        # totals are only published once the whole listing has succeeded, so a
        # failed bucket always reports 0 / 0 rather than partial numbers
        running_count = 0
        running_size = 0
        for blob in storage_client.list_blobs(storage_bucket):
            running_count += 1
            running_size += blob.size
        file_count = running_count
        size = running_size
        status = "Success"
        fail_message = ""
    except Exception as e:
        status = "Failure"
        fail_message = f"; Fail Message: {str(e)}"
    end = time.time()
    duration = round(end-start,2)
    message = f"Duration: {duration}s{fail_message}"
    results.append([bucket, size, file_count, status, message])
    # Show each bucket's row as soon as it is available (progress feedback)
    df_temp = pd.DataFrame([[bucket, size, file_count, status, message]], columns=result_columns)
    display(df_temp)
print("---------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------")
# Combined table for all buckets
df = pd.DataFrame(results, columns=result_columns)
print(f"End time: {datetime.datetime.now()}")
display(df)

# Pulling MD5 Population Across AnVIL

## Pulling High Level MD5 Population Stats

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# A single BigQuery client is shared by every per-dataset query below
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
# (selected by matching the dataset's default billing profile)
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details and pull source workspace(s)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except Exception:
        # Properties missing or not shaped as expected: report no source workspace
        source_workspace = ""

    # Pull MD5 summary stats from the dataset's file_inventory table:
    # COUNT(md5_hash) only counts rows whose md5_hash is non-null
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    file_size_query = """SELECT COUNT(*) AS file_count, COUNT(md5_hash) AS file_w_md5 FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
    try:
        df_output = client.query(file_size_query).result().to_dataframe()
        file_count = df_output["file_count"].values[0]
        file_w_md5 = df_output["file_w_md5"].values[0]
        # Any gap between the two counts means at least one file lacks an MD5
        missing_md5 = bool(file_count != file_w_md5)
        status = "Success"
    except Exception:
        # `Exception` rather than a bare except, so interrupting the kernel stops the
        # loop instead of being recorded as one more "Error" row
        file_count = 0
        file_w_md5 = 0
        missing_md5 = False
        status = "Error"

    # Build record for dataset
    record = [dataset_entry.id, dataset_entry.name, source_workspace, file_count, file_w_md5, missing_md5, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Source Workspaces", "File Count", "MD5 Populated Count", "Missing MD5s", "Retrieval Status"])
df_sorted = df.sort_values(["Source Workspaces"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

# Pulling Specific Files Across AnVIL

## Pulling Specific Problematic Files

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")

df_results = pd.DataFrame(columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id == billing_profile:
        # Retrieve dataset details and pull source workspace(s)
        dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        try:
            source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
        except:
            source_workspace = ""
        
        # Pull data files with null MD5s
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
        client = bigquery.Client()
        file_size_query = """SELECT uri AS file_name, size_in_bytes AS file_size FROM `{project}.{schema}.file_inventory` WHERE md5_hash IS NULL""".format(project = bq_project, schema = bq_schema)
        try:
            output = client.query(file_size_query).result()
            if output.total_rows > 0:
                df_output = output.to_dataframe()
                df_output.rename(columns = {"file_name":"File Path", "file_size":"Byte Size"}, inplace = True)
                df_output["Dataset UUID"] = dataset_entry.id
                df_output["Dataset Name"] = dataset_entry.name
                df_output["Source Workspaces"] = source_workspace
                df_output["Retrieval Status"] = "Success - Files Found"
                df_results = df_results.append(df_output)
            else:
                output = [[dataset_entry.id, dataset_entry.name, source_workspace, None, 0, "Success - No Files Found"]]
                df_output = pd.DataFrame(output, columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
                df_results = df_results.append(df_output)
        except:
            output = [[dataset_entry.id, dataset_entry.name, source_workspace, None, 0, "Error"]]
            df_output = pd.DataFrame(output, columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
            df_results = df_results.append(df_output)
        
# Sort dataframe records and write out to file
df_sorted = df_results.sort_values(["Source Workspaces", "File Path"], ascending=[True, True], ignore_index=True)
output_file_path = "null_md5_files.tsv"
df_sorted.to_csv(output_file_path, index=False, sep="\t")
!gsutil cp $output_file_path $ws_bucket/ingest_pipeline/resources/ 2> stdout
!rm $output_file_path
print(f"End time: {datetime.datetime.now()}")
print(f"Results copied to: {ws_bucket}/ingest_pipeline/resources/{output_file_path}")

# Examining Target Paths Across AnVIL

## Pulling Target Paths Across AnVIL and Looking for Embedded Buckets

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# A single BigQuery client is shared by every per-dataset query below
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
# (selected by matching the dataset's default billing profile)
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details (only the BigQuery access information is used here)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()

    # Count load-history entries whose target path starts with an embedded bucket ("/fc-...")
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    file_size_query = """SELECT COUNT(*) AS file_count FROM `{project}.{schema}.datarepo_load_history` WHERE target_path LIKE '/fc-%'""".format(project = bq_project, schema = bq_schema)
    try:
        df_output = client.query(file_size_query).result().to_dataframe()
        bad_paths = bool(df_output["file_count"].values[0] > 0)
        status = "Success"
    except Exception:
        # `Exception` rather than a bare except, so interrupting the kernel stops the
        # loop instead of being recorded as one more "Error" row
        bad_paths = False
        status = "Error"

    # Build record for dataset
    record = [dataset_entry.id, dataset_entry.name, bad_paths, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Bad Target Paths", "Retrieval Status"])
df_sorted = df.sort_values(["Dataset Name"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

# Looking for duplicate records across AnVIL

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
target_table = "sample"   # table to scan in every dataset
key_column = "sample_id"  # column expected to be unique within target_table

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# A single BigQuery client is shared by every per-dataset query below
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
# (selected by matching the dataset's default billing profile)
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details (only the BigQuery access information is used here)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()

    # Count key values that occur on more than one row of the target table
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    dupe_query = """SELECT COUNT(*) AS dupe_count FROM (SELECT {pk_col} FROM `{project}.{schema}.{table}` GROUP BY {pk_col} HAVING COUNT(*) > 1)""".format(project = bq_project, schema = bq_schema, table = target_table, pk_col = key_column)
    try:
        df_output = client.query(dupe_query).result().to_dataframe()
        dupe_count = df_output["dupe_count"].values[0]
        duplicates_found = bool(dupe_count > 0)
        status = "Success"
    except Exception:
        # Also reached when the dataset lacks target_table / key_column.
        # `Exception` rather than a bare except, so interrupting the kernel stops the
        # loop instead of being recorded as one more "Error" row
        duplicates_found = False
        status = "Error"

    # Build record for dataset (the query text is kept so it can be re-run by hand)
    record = [dataset_entry.id, dataset_entry.name, target_table, key_column, duplicates_found, dupe_query, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Table Name", "Key Column", "Duplicates", "Query", "Status"])
df_sorted = df.sort_values(["Dataset Name"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

# Looking for datasets with malformed file relationships

In [None]:
def validate_file_activities(dataset_id):
    """Check a TDR dataset for files generated by more than one file-to-file activity.

    The check runs in two stages against the dataset's BigQuery tables:

    1. ``anvil_file``: if the table has no rows, or every ``file_name`` is distinct,
       the dataset passes immediately and the activity check is skipped.
    2. ``anvil_activity``: among activities with no ``used_biosample_id`` entries,
       count generated files that map to more than one distinct used file for the
       same activity type.

    Args:
        dataset_id: UUID of the TDR dataset to validate.

    Returns:
        A status string starting with ``"Success - "`` or ``"Failure - "``. The
        failure message for stage 2 carries the offending file counts overall and
        broken down into Indexing, Checksum and all other activity types.
    """
    # Retrieve dataset information
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    client = bigquery.Client()
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        # The schema lookup is kept so a response without schema tables still
        # counts as a retrieval failure, as before
        response["schema"]["tables"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception:
        return "Failure - Issue Retrieving Dataset Info"

    # Check files for duplicate names
    file_query = """
        SELECT COUNT(*) file_count, COUNT(DISTINCT file_name) AS distinct_file_names
        FROM `{project}.{dataset}.anvil_file`
        """.format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(file_query).result().to_dataframe()
        if df.empty:
            return "Success - No files in dataset"
        file_count = df["file_count"].values[0]
        distinct_name_count = df["distinct_file_names"].values[0]
        # The aggregate query returns a row even for an empty table, so the
        # "no files" case has to be detected from the count itself
        if file_count == 0:
            return "Success - No files in dataset"
        if file_count == distinct_name_count:
            return "Success - All file names are distinct"
    except Exception:
        return "Failure - BigQuery Error"

    # Check activities
    activity_query = """
        WITH activity_flattened AS
        (
          SELECT DISTINCT generated_file, activity_type, used_file
          FROM `{project}.{dataset}.anvil_activity`
            CROSS JOIN UNNEST(used_file_id) AS used_file
            CROSS JOIN UNNEST(generated_file_id) AS generated_file
          WHERE ARRAY_LENGTH(used_biosample_id) = 0
        ),
        activity_agg AS
        (
          SELECT generated_file, activity_type, COUNT(DISTINCT used_file)
          FROM activity_flattened
          GROUP BY generated_file, activity_type
          HAVING COUNT(DISTINCT used_file) > 1
        )
        SELECT *
        FROM 
        (
          SELECT 'Files generated from multiple file activities (Activity Type - All)' AS metric, COUNT(DISTINCT generated_file) AS result 
          FROM activity_agg
          UNION ALL
          SELECT 'Files generated from multiple file activities (Activity Type - ' || activity_type || ')' AS metric, COUNT(DISTINCT generated_file) AS result 
          FROM activity_agg
          GROUP BY activity_type
        )
        ORDER BY metric
        """.format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(activity_query).result().to_dataframe()
        if df.empty or (len(df) == 1 and df["result"].values[0] == 0):
            return "Success - No files generated from multiple file activities"

        total_file_count = 0
        index_file_count = 0
        checksum_file_count = 0
        unknown_file_count = 0
        for metric, result in zip(df["metric"], df["result"]):
            result = int(result)
            if metric == "Files generated from multiple file activities (Activity Type - All)":
                total_file_count = result
            elif metric == "Files generated from multiple file activities (Activity Type - Indexing)":
                index_file_count = result
            elif metric == "Files generated from multiple file activities (Activity Type - Checksum)":
                checksum_file_count = result
            else:
                # Several unrecognised activity types may be present: add them up
                # rather than keeping only the last one seen
                unknown_file_count += result
        err_msg = f"Failure - Files generated from multiple file activities. All: {str(total_file_count)} Indexing Activities: {str(index_file_count)} Checksum Activities: {str(checksum_file_count)} Unknown Activities: {str(unknown_file_count)}"
        return err_msg
    except Exception:
        return "Failure - BigQuery Error"

# Loop through datasets and validate their file activities
# (fill in the dataset UUIDs to check; the empty string is only a placeholder)
dataset_id_list = [
    ''
]
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Validating dataset_id: {dataset_id}")
    status = validate_file_activities(dataset_id)
    results.append([dataset_id, status])

# Build the results table once, after the loop, so it exists even when the list is
# empty and never shows a stale frame left over from another cell
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Looking for anvil_activity records with malformed source_datarepo_row_ids

In [None]:
def validate_activities(dataset_id):
    """Check a dataset's anvil_activity table for malformed source_datarepo_row_ids.

    An entry of the ``source_datarepo_row_ids`` array is treated as malformed when
    it contains no ``:`` character.

    Args:
        dataset_id: UUID of the TDR dataset to validate.

    Returns:
        A status string: a "Success - ..." message when no malformed entries are
        found, otherwise a "Failure - ..." message naming the problem (dataset
        lookup failure, BigQuery failure, or malformed entries detected).
    """
    # Look up the dataset's schema and BigQuery location in TDR
    schema_info = {}
    tdr_client = utils.refresh_tdr_api_client()
    dataset_api = data_repo_client.DatasetsApi(api_client=tdr_client)
    bq_client = bigquery.Client()
    try:
        dataset_info = dataset_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        schema_info["tables"] = dataset_info["schema"]["tables"]
        bq_access = dataset_info["access_information"]["big_query"]
        bq_project = bq_access["project_id"]
        bq_dataset = bq_access["dataset_name"]
    except Exception:
        return "Failure - Issue Retrieving Dataset Info"

    # Count array entries that lack the ':' separator
    malformed_id_query = """
        SELECT COUNT(*) AS bad_record_count
        FROM `{project}.{dataset}.anvil_activity`, UNNEST(source_datarepo_row_ids) AS source_datarepo_row_id
        WHERE source_datarepo_row_id NOT LIKE '%:%'
        """.format(project=bq_project, dataset=bq_dataset)
    try:
        counts = bq_client.query(malformed_id_query).result().to_dataframe()
        # Only a non-empty result with a positive count is a failure
        if not counts.empty and counts["bad_record_count"].values[0] > 0:
            return "Failure - Malformed source_datarepo_row_ids detected"
        return "Success - No malformed source_datarepo_row_ids detected"
    except Exception:
        return "Failure - BigQuery Error"

# Loop through datasets and validate their anvil_activity source_datarepo_row_ids
# (fill in the dataset UUIDs to check; the empty string is only a placeholder)
dataset_id_list = [
    ''
]
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Validating dataset_id: {dataset_id}")
    status = validate_activities(dataset_id)
    results.append([dataset_id, status])

# Build the results table once, after the loop, so it exists even when the list is
# empty and never shows a stale frame left over from another cell
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Examine source_datarepo_row_id Referential Integrity across AnVIL

In [None]:
#############################################
## Functions
#############################################

def validate_source_datarepo_row_ids(dataset_id, table, source_table):
    """Check that `table`'s source_datarepo_row_ids resolve to rows in `source_table`.

    Entries in `table`.source_datarepo_row_ids are expected to look like
    "<source_table>:<datarepo_row_id>". Every entry prefixed with
    "<source_table>:" must match a datarepo_row_id in `source_table`; entries
    that do not are counted as dangling.

    Args:
        dataset_id: ID of the TDR dataset to inspect.
        table: Name of the table holding the source_datarepo_row_ids array.
        source_table: Name of the table the row IDs are expected to point at.

    Returns:
        A (status, message) tuple. Status is "Success", "Failure" (dangling
        IDs found, or the dataset lookup / BigQuery query failed) or "Skipped"
        (one of the two tables is not in the dataset schema).
    """
    # Retrieve dataset information
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    client = bigquery.Client()
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        table_list = [src_schema_table["name"] for src_schema_table in response["schema"]["tables"]]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        # Keep the best-effort return value, but leave a trace of what went wrong
        logging.error(f"Error retrieving dataset info for {dataset_id}: {e}")
        return "Failure", "Issue retrieving dataset info"

    # Check that tables to validate are both in the dataset
    if table not in table_list:
        return "Skipped", "Table not present in dataset"
    if source_table not in table_list:
        return "Skipped", "Source table not present in dataset"

    # Check for bad source_datarepo_row_ids linkage.
    # STARTS_WITH on "<source_table>:" (rather than LIKE '<source_table>%') keeps
    # IDs from tables that merely share a name prefix (e.g. "sample_set:" when
    # checking "sample") out of the comparison, and avoids "_" in table names
    # being interpreted as a LIKE wildcard.
    validation_query = f"""
        WITH datarepo_row_id_list 
        AS
        (
          SELECT DISTINCT REPLACE(src_datarepo_row_id, '{source_table}:', '') AS datarepo_row_id
          FROM `{bq_project}.{bq_dataset}.{table}` t1 
          CROSS JOIN UNNEST(source_datarepo_row_ids) AS src_datarepo_row_id
          WHERE STARTS_WITH(src_datarepo_row_id, '{source_table}:')
        )
        SELECT COUNT(*) AS dangling_row_id_count
        FROM datarepo_row_id_list
        WHERE datarepo_row_id NOT IN (SELECT datarepo_row_id FROM `{bq_project}.{bq_dataset}.{source_table}`)"""
    try:
        df = client.query(validation_query).result().to_dataframe()
        # An empty frame is treated the same as a zero count.
        if not df.empty and df["dangling_row_id_count"].values[0] > 0:
            return "Failure", "Dangling source_datarepo_row_ids detected"
        return "Success", "No dangling source_datarepo_row_ids detected"
    except Exception as e:
        logging.error(f"BigQuery error validating {table} -> {source_table} in dataset {dataset_id}: {e}")
        return "Failure", "BigQuery Error"

#############################################
## Input Parameters
#############################################

# Input the relationships to examine. Each entry is [table, source_table]:
# `table` holds the source_datarepo_row_ids array and `source_table` is the
# table those row IDs are expected to resolve to. Keep entries unique; a
# repeated pair runs the same query twice and duplicates its result row.
relationship_list = [
    #["table", "source_table"]
    ["anvil_donor", "subject"],
    ["anvil_donor", "participant"],
    ["anvil_donor", "anvil_biosample"],
    ["anvil_biosample", "sample"],
    ["anvil_biosample", "subject"],
    ["anvil_biosample", "participant"],
    ["anvil_biosample", "anvil_sequencingactivity"],
    ["anvil_biosample", "anvil_activity"],
    ["anvil_diagnosis", "subject"],
    ["anvil_sequencingactivity", "sequencing"],
    ["anvil_file", "file_inventory"],
    ["anvil_dataset", "workspace_attributes"],
    ["anvil_project", "workspace_attributes"],
    ["anvil_activity", "sample"],
    ["anvil_activity", "file_inventory"],
    ["anvil_activity", "participant"],
    ["anvil_activity", "qc_result_sample"]
]

# Input the datasets to examine:
# Each entry is a TDR dataset ID; the execution loop below passes every ID,
# paired with every relationship in relationship_list, to
# validate_source_datarepo_row_ids.
dataset_id_list = [
    'cc19d19e-6f7e-41b8-87a6-77f41d53e650',
    '16031a34-f1ba-4bde-af43-1822f1516944',
    '6765ce2d-ebc8-4367-8855-c0f8e62cb355',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    'cb8ebcd0-bb5e-4a6d-bfef-5c651a1a9f6e',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '4999a410-990e-484b-b4f3-d636f894a741',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    '65793118-3c88-4185-9172-2354850e6056',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    '8de6dae2-55ff-4287-9b75-5b2a950c1f44',
    'd48db47e-acba-4377-b031-f6dfc21f3658',
    '3fd2204c-8654-4af7-832f-c186447262e0',
    'ae34e63e-13af-48b8-8b72-8137289091b3',
    'd3ed2595-b8be-40c8-b7b6-10a4997b9d2e',
    '575dc7da-58ed-407d-9e88-7b586f28bf65',
    '61803dc8-f649-43e5-ab15-d351f2cef629',
    'abe58d43-e1c7-4953-aa41-4d3b6f6cca44',
    'fa278604-7d85-4491-a30d-15c7821f8b00',
    'b5c0bf91-9d20-41a2-9dd2-87d0ef0310f9',
    '97c636f9-0983-481f-8ff9-7b5b3ee6b10e',
    '32bc49c6-7583-4613-a72f-5edb12b2a808',
    '3eb8ea77-4605-4bb7-90f9-671953abe4a2',
    '2b08cb76-061d-44c6-a00f-b43a5421df5e',
    'ab7e390a-adc5-4f9e-b317-a216a2904c93',
    'd1e6d0e4-d49e-4a16-93c6-7956b2c03414',
    '713f8676-8034-4827-bccc-cd6d95b1a4c4',
    'b00883d8-9251-435d-aefc-8a703d96d2fb',
    'eefbea02-0d65-441e-b455-35aa21d25ba3',
    'ff7e3be2-c0ac-4d97-85da-6229bf7585ac',
    '7ac92a42-e112-49c3-a8f5-8ad2c7ef5578',
    '0701aae2-8661-4eec-84e0-7c8be1c89a18',
    '7efb1905-34b4-4f1c-a8a6-8e64b3640a68',
    '11a2b088-8c1c-47d2-9c1e-455d457d2f05',
    '74608bd9-39e4-4f48-9b7c-1cd9d3c599c9',
    '7baf8e8c-de11-452d-b2e1-aad7c08cc18e',
    '23b0219d-0820-4017-b942-bda8578e90e2',
    'a5f631ea-2b4b-43f2-9ea0-e31f2b11fa27',
    '8523489a-f57c-4993-81e4-1ed86a5c092d',
    '395da421-e6e8-4a26-ac93-eb7050a7cb1f',
    '9cb5ce25-38e8-4628-9ddf-d6aedf5efe0f',
    '615f6246-1c39-4e44-a9d4-c7133a2ae62d',
    'a647528d-925e-4c02-8825-ff54720c6ee4',
    '2c6f63b2-439e-499f-b687-b3fdd88a492e',
    '68a916af-2e0c-41bd-8535-c7eacbc2d1b7',
    '0e7f31a0-c712-4ebf-ab3a-64c37f43e52a',
    'c8b1d323-f352-482e-bf17-82075c23dcee',
    'd30f51c7-d642-4e7d-a168-967b9520a80a',
    '8d89608c-0d61-4d71-a2e3-9fbc6cda69bf',
    '71219f56-551f-4ad4-9a38-cc4aaf8a1e9a',
    '48dd6010-77dc-465b-a27c-695e29b57a5e',
    '21384132-1697-4e9b-b863-a6492d13285d',
    '582f5f8d-b96f-490e-b417-ba824baeb06c',
    '7e825ee6-7c03-43cc-b0a4-0d9203a30bd9',
    '2843292e-e494-4642-90e0-57e5c153f12c',
    '4ecbb7c8-0246-47f8-9654-4caca1d52565',
    '7e3ea1bd-95ba-4cad-90c8-3eec95be9cc8',
    'bbba696b-d023-4bb1-a213-c8bee31e8bae',
    '470eee0f-2053-4d9b-9f5e-ca9661a6cc16',
    '5c1dc76d-b703-445c-9b38-cc2d00b9ab16',
    '608d793e-a78b-4872-a50c-21a9eaa60ec3',
    '74ede771-6781-4980-bfb9-5d853b7cdd6f',
    '6c47e282-5d5e-445c-b6bd-c0024946fbe0',
    'ff8ffbcf-c932-48c7-8d5e-d995d5680e21',
    '9d74b4f0-b2d4-46aa-867a-52fb6102bfdf',
    'e34f15f7-c225-4314-a638-90504bb0aa0d',
    'c1d222ab-bc0e-4e13-8379-0ee5be9e140e',
    '582187a5-ad63-4759-9162-55fa6337eb07',
    'd1e8d19a-970d-4ede-b5bc-9cab7237adec',
    'af867604-d801-41cc-9949-017eb30a0cbf',
    '722e332c-fb1a-45fe-80c7-cc670f025b7f',
    '9f152896-ebf1-4756-b678-bdf739a92256',
    '478aa270-fbd4-4a45-8f63-221b4066168e',
    '0b06619d-39d9-4437-8c42-2e415faa634c',
    'e9c7ad29-2213-4648-9164-33a07bd42cdb',
    '1d140c76-a06b-42a0-bae8-b9e169ebe394',
    '9f7dbe05-96b5-4b2f-9f3a-34b552e3dd21',
    '9e3fb02d-dcf6-486f-a42d-89446a852057',
    '15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
    '3a3100bb-369e-47c1-a77c-2cacb7cf020d',
    'e642bca0-52fb-4ab3-ab3a-acaab83deda7',
    '2c11b505-17c8-402e-8422-0239accb449d',
    'a6d7e030-e6c8-4c62-8cb5-165ef54987c4',
    'e25a8172-1e34-442c-a45d-583027a2d734',
    'c911503c-f010-4c17-ac57-1d82e954bdc7',
    '128dce74-fa37-4f2f-8a80-d542edd81a11',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'f461fca1-80b2-4980-83a8-e165d49acc18',
    '37f0f1f9-83fb-49a1-9941-093c068c32d0',
    'cfb3dad7-c6d9-47c0-81b0-2133d75f5c0d',
    'c5c0893f-b254-4038-8d08-b28ef5a26b5d',
    'bbcf8529-1a04-43fc-b6cf-cb161028159d',
    '631deea0-2821-4d14-ad02-dc0ce4864924',
    'eede320a-ed63-41d8-960d-5405a26a194f',
    '36dccf81-6932-43ae-9864-53379832d878',
    '01eaf423-8cab-491a-b82e-6915dbc73594',
    '0481a135-9db1-424f-9065-a83ebd7ec995',
    'b60876c5-d825-4303-befb-ffff55b92aba',
    '64f2dbe1-6f58-493d-ab6b-c93568d828f4',
    'a9626803-72c2-4e23-968c-a090e3f22c5e',
    '902596ce-714e-49b3-8271-f3dfece52309',
    'f5f29e4b-68f7-443c-b290-0827d4167fd5',
    '6f49717f-8f57-42d0-8548-316ecc292415',
    '7e693091-8ae4-4c40-8e66-c3b39f01b90e',
    '5137255f-0c58-4ac7-9266-bda8ab0247c2',
    '278a26cb-a710-4fff-928e-fc2e7084a75a',
    '9fc492f3-8d13-47ae-93e9-812c0224f1aa',
    '822d381e-cea0-45bb-8fa0-1b7194b4b64b',
    '44161b51-953d-4f6b-9448-5cba4a44a9d8',
    '0b90b2ea-8ca3-406a-9f69-95eddf7699ef',
    '8cc59f51-b0df-4a5d-a3c5-83ee526ff1af',
    '85646f4a-e424-4363-8033-1e7522e8f175',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    '583023a1-aa12-40e2-a964-8ad50ad400ba',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '488a38ee-f996-482d-a562-a4474f5594de',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    '279e5670-8a47-4992-bb10-14e6c719db97',
    '525a9535-74a3-4757-9507-52a684cf5647',
    'd306000b-88c1-4220-8d7e-933c0118a983',
    'ab76b5ca-e464-4063-b949-853f61036370',
    '672b617f-936e-440a-a735-80f94798aed1',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
]

#############################################
## Execution
#############################################

# Run every configured relationship check against every dataset and collect
# one [dataset_id, table, source_table, status, message] row per check.
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Validating dataset_id: {dataset_id}")
    for relationship in relationship_list:
        table, source_table = relationship[0], relationship[1]
        status, msg = validate_source_datarepo_row_ids(dataset_id, table, source_table)
        results.append([dataset_id, table, source_table, status, msg])

# Full, row-per-check view of the outcome
results_df = pd.DataFrame(
    results,
    columns=["dataset_id", "table", "source_table", "validation_status", "message"],
)
print("Full results:")
display(results_df)

# Per-dataset summary: the set of tables that had at least one failing check
failures_df = results_df.loc[results_df["validation_status"].str.contains("Failure")]
failures_agg_df = (
    failures_df
    .groupby('dataset_id')['table']
    .apply(set)
    .reset_index()
)
print("Aggregated results:")
display(failures_agg_df)


# Examine and validate file extensions

## Examine file extensions in AnVIL data

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
dataset_id_list = [
    '4e699ead-bbb5-460d-9b32-2b1b322c601b',
    'a36eeaf7-d6dd-4887-bdbd-e435a07ba156',
    '8de6dae2-55ff-4287-9b75-5b2a950c1f44',
    'ce6692aa-0f97-48fa-8628-b8fa3eab4726',
    '31433635-91d4-431d-8d26-bc54e84c8e8c',
    '0b06619d-39d9-4437-8c42-2e415faa634c',
    '12ffb586-5f6a-4f0a-a353-d2f34599f4cc',
    'e642bca0-52fb-4ab3-ab3a-acaab83deda7',
    'b7fb531e-25a4-427c-9679-b7bdc3d03535',
    '3615e063-f24b-47f7-87cb-430e8aca8d0c',
    'a3ea4f97-6657-4d3c-9be6-96f097f5c952',
    '9f4ac69c-0919-4ac1-98a8-976ed79ace03',
    '96461004-f4b3-4f82-a842-293b3ec46a60',
    '841b7883-9447-4ea0-ae4a-84ea0240d919',
    'd0fc3d6a-c3f4-4533-8a23-817a4e27f9be',
    '2cace5dc-f660-45d4-b689-c4c89e77697c',
    'd2272f2d-c606-4027-b8ea-0bdd6d9d6535',
    'b12fb9be-2ce0-4bfd-8503-732fabba06ab',
    '9f9fc99a-b867-49a9-a3dc-8a39efbd5fa2',
    'ce58654d-b7d3-466b-99ba-b203d527a543',
    '179eb85e-2557-4677-9cba-d763310f3df9',
    'cba804c9-0bdd-4219-a53e-98c8db6334a0',
    'd239dd7b-8d10-4960-aa91-8f8ede641e25',
    '5c6a1c4f-ccd3-48a8-ac00-e18e5ecaa0bb',
    '19e2c8ab-853a-4204-86c3-f591125fbf63',
    '7cf0d3d0-f79b-4bfe-bfc8-e4e6c33dd4c3',
    '3a72e4b8-afb4-4299-98ec-a9ba9606be06',
    'f3c89298-0dd2-40da-8627-3baea553b34a',
    '9a32e23e-840d-4ba3-8cd9-392f48b8e9d2',
    'c5d967fd-09ce-4b02-97dd-ac3abf6f79fa',
    '5069fc2c-b957-4130-adca-6eabae943867',
    '173e56f7-b813-4c41-89ff-09a824e1407f',
    '80312f74-bd56-4938-96ba-e9bed95d1f3b',
    '017445d7-d56e-4e2e-b480-b4879b51e944',
    '13b2076a-cfe1-49ec-ac61-bad1af9a52ea',
    '175dd803-02c7-4823-81d5-9e0621652ace',
    '15492baa-05ed-47bc-b50c-e587679ae51a',
    '33705ce9-b2b3-4edc-9b47-f54283e193cf',
    '92486440-3a46-44dd-b853-b300ef75b31e',
    'dd2cb8fc-42a6-482f-898e-ef6125feccb8',
    '4e99b8e1-40b9-4fb2-90a0-d85e926ef31e',
    '128332b6-5060-4ec4-b6a6-f53b54a810be',
    '06f05f58-3c83-4f5c-bddd-bed7d2d1d147',
    '51e9935f-ec18-4832-801a-6d9186537572',
    'd6291444-8c3c-470c-b28c-7cf1d5c7aad8',
    '41cb9f29-4ba6-4690-821c-cb085e6b0f2f',
    'e68d1d39-99df-4cd7-8053-1b298f03eabb',
    'b252e3ac-4a8c-48e0-9999-5ee0c9a5842d',
    '7ea006d9-1e19-4678-b2e6-d4a1ea327f74',
    '34fd3b22-ac73-47d2-8849-5877158ec072',
    'a08dc7a6-f8ce-4205-95d2-83f614c2c32f',
    '577f36fe-8154-4c82-ac87-b2a64cb68f35',
    '7ce3270e-b2f2-47f4-a288-639751b2f87f',
    '36bdd59f-4f5b-43cd-8d34-a21ef87bbf30',
    '41d12dc1-8718-4439-b409-26cc23573107',
    'c4c49fcd-0c20-4cff-841a-cb58f5689c5b',
    '6b40557c-ddc3-4e7e-8a45-1761e7fcb8b5',
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
    '9ee2a552-89f8-4a48-9c94-9fa26ebb7483',
    '425412ba-894a-4824-acb8-bf18fe4576e0',
    'f22bd762-5c45-453e-bf22-b174514abb84',
    '0ee62643-b064-42f8-9b09-5d10eacd70a3',
    '1a7f6728-5116-4f24-897a-59a7f322cfd2',
    'c37b388c-7107-43d6-bee6-4e82b40ed271',
    'bf6f1d78-6a0d-4afb-aea6-17a3c34340db',
    'a3becdde-018b-46f0-adea-d587076eef4a',
    'a9ad3a05-24fb-4e59-85b0-ee09e55a4492',
    '719f7581-21db-4aec-8c46-4a5811832710',
    '3725b660-1106-4173-9c4b-0a15926becf5',
    '318a75f4-ac50-4944-81b0-70a1323e7497',
    '75fb0984-2124-444f-881b-30a1a6f8b8f7',
    '15be288e-53e1-41cb-8d20-8ea87efb9258',
    '700303c2-fcef-48a5-9900-096bf34e2d83',
    '38fd20ce-affd-4791-9810-7f5a7fe876d0',
    '8b8185d3-ba5c-4832-af23-3ff8ca6ed016',
    '140797da-dc94-4fc2-8b0b-f2e1dec7bd43',
    'ec97fa0f-e174-40fe-a6b8-ee240bdf4318',
    '5488d7c1-5195-4ebc-b0f0-31033fa06dc9',
    '56f9888f-e623-4a1a-b2b4-46378a6cd6fe',
    'b2e7f15b-65d5-4812-abfd-b2dbc6d18850',
    '69f8d7c2-2e14-48e6-b838-7881016313fb',
    '809fa952-3178-46b7-bb82-8a476ef32e67',
    'dfb14a1f-38b2-4668-b98f-59b5b5b53ca3',
    'f553b765-1c9c-464c-a8fa-07700a1691c5',
    'b108dfd0-711d-4bc1-aab5-1b312226c8ad',
    '38eca26c-d79e-4447-99d3-1889d20ade21',
    'd5a0e24d-689a-4854-92c7-9a39f980b523',
    'ecd0606f-4fa7-4e57-b6e8-eea377e65d5b',
    '0c6bc810-5ae0-4926-a56b-2bc2fe7dbe6d',
    '9dc31133-c882-4f39-903a-a25f316bb560',
    '23a0ede0-4f97-46af-9f04-bd2805050980',
    '7593c1c2-3680-4bf5-8a65-dce5f96a3b59',
    '02661394-2886-4ef7-aff1-d53225c82025',
    'c1644d4e-06e2-4fa8-95f1-5c1da5831257',
    '267cf516-dd33-4640-a71a-78bd8f5db9d8',
    'df06ff22-6a2d-4934-aac9-c8368efbea1a',
    'dcdefb14-f6de-4c46-ac7e-842b273416bf',
    '28208cc6-50bf-4864-9a48-981632066640',
    'ccfe264d-a35e-44f8-9b2b-241a0f8327cc',
    'f177843d-47fb-46ae-83be-73c92ee85081',
    'd049d487-1a69-4358-8dad-0e6fa6c06fdd',
    'da29226b-e856-4014-8c8b-c4268d0df2cc',
    '714dcdbd-8d17-40b0-8246-0e941af8175d',
    '53ce7d12-facc-4412-a710-f535efb209a2',
    'b9842819-5fd6-40c5-9668-aae1ea44a308',
    '7c056125-3ed8-459c-b73e-edfa3f80cc27',
    'ed1215f1-787c-40f5-9d77-4b5bc2dfbb84',
    '7feeb2b1-1926-4968-b6dc-e0a1e4cf8d4a',
    'dbf5d87b-4cab-44cd-a792-1d0218aad973',
    '85dbde76-c130-40b2-8a8a-ba815ba499da',
    '1c2fe11d-b020-4c54-8c71-1ea91623d626',
    '84133066-68cd-41fa-819b-d74a3ac85862',
    '54c6fa73-9b84-4a3b-9e97-e4e43165c48b',
    '90ba1853-f845-4502-ba36-b75b9e571bc5',
    'f3c88c3c-8e1b-4af9-9467-0621404e314c',
    '332bb145-6ef1-40ef-932c-aec5bb6210d9',
    '44f83f20-d618-40b5-b2cb-3676b8fe3ad7',
    '3c2c39a9-4cc2-4f7c-89e0-054a871e2c4e',
    '2ebb722f-a3df-4ea4-b72a-813e3db0bab5',
    '1817528a-4f88-4ed6-8965-9eae0220ab27',
    'e03eb011-05f9-4491-b779-0cc2aefabff1',
    'd4bb7169-5a7d-4090-ba62-12ea799c3ade',
    '3f172982-060d-4339-a09b-6994c2c9eb16',
    '51789659-5233-4ee7-8bca-dedebfc87773',
    'a5fe75bb-d28c-42fb-aaf8-92fa37b266d2',
    '6545d602-e5b4-4dd1-8f6a-64e0a1952ddc',
    'e5c79b74-20d5-4b6f-8085-0bc788eed2ea',
    'f492567d-6db8-45c8-b44e-6b5def26c812',
    '12e54f96-00d9-4a38-921e-e0d42610b2c1',
    '845b131e-7c05-4397-ad40-23dea8e9b399',
    'bd492b71-b20e-4056-b8ae-ad8c94cfbc02',
    '9ecc231f-e3d3-4417-a98a-c4db4c638161',
    '3fb2d04a-d18b-4bdc-9372-99b992f2ae42',
    'e922a496-e686-4fa1-911d-2159ceb0f09f',
    '8fbfea50-6a71-4b19-98e9-f95e3a8594c7',
    'd911e57a-ebb8-4be8-876b-d8e5790ddce3',
    '6c9423a2-3ea7-4c3c-9b12-0cc993bc095f',
    '52e015b5-22b7-4a96-9f0a-ea3afccbfcbc',
    '325f3ee8-2adb-4092-bd78-1b5ea5b0d1d6',
    'ae50ef98-ef3d-4427-b094-83b2d90787a0',
    'e6771964-50e9-482f-9d23-18c22cd89ab8',
    '3fbacc64-4c53-4770-8cdf-a616c10ec5c7',
    'a3ae33bb-8b3a-47e5-a2d1-a49c954776b3',
    '0b0a52bb-a1a2-4638-9259-4447761c2da4',
    '0eb42259-7b44-450f-a9d7-500b2ea7179c',
    'e16adabb-88e0-4739-983a-98ac5c181842',
    '71f94dff-fbe8-4881-af1f-4987b67d5181',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '0e65b131-fd14-4fce-908b-c5b89a71a9c1',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
    'd00353de-f6f9-42d9-8a8f-f88b3d880dbf',
    '3be57453-9325-4c2e-b73a-832139b61778',
    'fcc60ac9-0d20-4a7c-97e4-e3c8d3aa8f76',
    'dc5f85d8-333b-4b68-b160-ad9856233887',
    '655e6a61-5400-4d8a-95bc-1506e026b289',
    '1f2d14d4-1bd8-46fc-9d35-1a415e5f326a',
    '64fd39fc-b32e-4b0a-8f83-4bf11b197462',
    'a77a2c65-38fe-4bf7-9ea6-0a2dc65eb21f',
    '25248cd8-2e98-4a83-9ccf-af7214fa71d6',
    '158ebecd-4596-4541-b832-a137232b7036',
    '1ccb95c3-1901-428e-b7bb-34495f41f4d2',
    '02ff1051-cd1d-4bbb-a005-21384cbff846',
    '0144b0d3-a809-46df-8c67-7ce42bdd579a',
    '35a1009d-93a2-49b1-a801-fe84d6b7a2f5',
    '50132478-c9fb-4dc5-86cd-d5dfab909393',
    '35064fc1-6c52-4005-8e99-cb0d6afd3f8c',
    '5cf859f6-990c-4b04-8609-35d5c57920f0',
    '62cfdce6-2d4d-415c-a11e-5ab60131c668',
    '2d07dd45-a263-440d-a339-9ccbab93aba8',
    'f1513955-0264-4733-bd25-3f752c61a323',
    '93e712f2-3e54-466e-aa53-57eb69c43bc0',
    '296f653a-91a8-4139-9bab-e6ae13afe99c',
    '633dc1aa-084d-43bd-9b17-bc6e57f81d48',
    '9320b3b5-3944-4bd2-913a-23b72bccd86c',
    '86ab4d3b-86ce-422b-ae6f-1ec6968a874d',
    '4124010f-7308-4831-80d7-ea14343249ab',
    '3037caeb-fa7a-4924-b399-7e4c7173b3b8',
    '146b72bc-1dcc-4e3b-bcda-d3dd25418012',
    '1d575e14-c3b1-4ead-a63b-a21c08c6a14d',
    'c5b1e333-7203-41ce-b8f7-3ef3a3bd721f',
    'bf519ea2-afe1-486a-9954-7362f10b6b60',
    '254ffffc-2bd6-4b2e-905b-a8c54c348cd0',
    'bb65d291-a673-4e4d-8a37-ab1f7401a902',
    '3a9604d7-456a-453d-a46b-40408624a07e',
    '2cbe079d-e7ab-47d8-836e-454a71440297',
    '84fad495-2756-472f-ad20-f91de6f67baf',
    '28e73469-12d4-493b-bf6f-83359c1f69c5',
    'db266afc-2f75-4b03-a3b8-c69e0ce6f713',
    'dd6866e4-8949-45bd-8910-8ce64f79e3c7',
    '12bbfa4c-c30a-4cf6-b79f-45354f842964',
    '84ac0d05-4be5-43e9-973e-ef999144d802',
    'ff8b1212-858a-4048-8f63-9464c922591a',
    'c814d754-cdc5-4b0d-8671-a39e85b2c473',
    '797b2563-5d56-4f5c-bdaf-3bfd11e8f5b3',
    '85287d84-fefe-40df-ad40-5b135ee0c07f',
    '7eeede5a-c86f-4577-9f3c-65ab618a6dee',
    'a52c04ee-cfef-46bb-9b40-6a9b292e1a7b',
    '0194eea9-d779-4957-8521-11717a378e66',
    '0d82658c-44b3-4cea-a388-3353a96a31ef',
    '77dca0d5-4d22-4415-8858-075590d25cb5',
    '1b05159b-6277-4345-9d59-f7bba5ea1d56',
    '92299ff4-c0d0-4e94-b374-75d0038cbd68',
    '60f96582-79ad-4461-9f9a-53c1bc3d17b6',
    'cc107de7-d623-464a-a875-c8b7ae5fb09d',
    'b5d7c34a-c383-4fc7-aa4d-b6dc941cd41a',
    '85baa8f8-619c-4165-9d3e-53220f645814',
    '3f278de3-f201-4344-9639-d35cd7a62adb',
    'c423b18d-12f2-43e4-97f9-993e2943270e',
    '416b8daa-9537-46db-ae7b-3f5ff5f01dc3',
    '61940344-e6c1-484e-ba10-131f43a9b13a',
    'ceee2791-0fdf-45fc-a4e8-8077916771aa',
    '5205f817-7de0-48b4-89fc-6398cf13bff7',
    '1d2f5472-ab6a-4a9b-ba53-520858cf79db',
    'f757278a-3c74-4690-bf89-5149d21ff3af',
    '5a103ab3-29c3-4d07-a0f6-4999c256cf26',
    '2a263db0-8c33-4171-840f-54bf4755a4b9',
    '9828f3fe-f676-4bf1-b600-5effa24ea9c8',
    '28849dc9-a97f-469b-b2ac-a8ff97693f02',
    '29cd0578-fb47-495a-8f48-b37325eed81a',
    'ba503d2e-48af-48bb-910a-be41790d921c',
    '472f01ad-7bc3-4fe5-9771-2695930dbc95',
    '956cd931-7077-4a08-9c75-ab8b4e5d1eb8',
    '31a42df0-29f6-4d4e-ae5c-2e13abc355f2',
    '13364604-ed08-4a61-89cd-65eb372ac8c3',
    'b724164c-712c-4615-97b7-529a108a753a',
    '22199347-9454-41e5-8912-eb38edd33a25',
    '60cadee3-9e63-4897-ac81-4fb283033648',
    'dcd4112f-09d3-43ed-8441-df9bf4c9ddc8',
    'da02c3f1-371b-4afe-9b5e-b8c584fd5907',
    'f6565f2f-4478-45ad-8c11-04dd242fc6a9',
    '275ea204-4612-4d3c-ac0d-f110f61d62ad',
    '72f73fc5-6a3a-43a0-8cce-09f4726b736c',
    'be72f1e3-b5f5-43f4-80db-6d7de93a654e',
    'b6bf4699-6f61-4c6a-9d42-ad055a0de008',
    '8abf299c-cd4e-4ce0-b5cf-4f9abe8cc891',
    '0b6eb077-2eca-4fe6-b012-26fab725b907',
    '5c659e81-e687-4710-a4fd-000ca593155d',
    '42965913-4223-484a-9b3d-abc0002d277d',
    '72e639d1-b8c5-45fd-9acd-a8e5e2b7fa0d',
    'd6823ccd-7247-4efc-8841-f53f456351ed',
    '1048a860-d5ff-4f61-95e5-851e1266d4c1',
    '8681cdcf-b775-4b56-aace-3f3e448261ef',
    'f0db3b27-c952-477d-bc33-9b96a250e168',
    '9e1a6a7d-b45e-4fd2-a1ff-df131da4c713',
    'd596ee91-481c-4eb5-9a8a-88c1e10ba9b6',
    '24470eb6-97c2-4cd4-b484-87a7d634c5b3',
    '3a781e70-cf6e-41c2-8d68-2326f16986e7',
    '048afc84-cdd2-4b39-8ea5-7351f4699761',
    '3fdcdafd-5328-418b-85f8-47b0006de468',
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    '68ea655f-b4a3-43e2-95e4-f158ca2d67dd',
    '0447c960-bbfe-4e42-a95b-dd3d1d9a368e',
    'cc19d19e-6f7e-41b8-87a6-77f41d53e650',
    '16031a34-f1ba-4bde-af43-1822f1516944',
    '483d3454-54da-4243-bbeb-98cbf1d088d0',
    '6765ce2d-ebc8-4367-8855-c0f8e62cb355',
    '1b4a324a-5621-4399-85d2-f91aa03418b5',
    '09642596-d33a-4261-8bf7-eb1dbb37d572',
    '75119ed5-b8aa-4f45-bdef-e3c673bbe44c',
    '04a874df-c57b-40fc-9139-bc3a05129115',
    '2b8ad26a-e66e-4b03-a65a-5b504cecacfd',
    'a7226f10-bdba-4284-97b3-0738a5912770',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    '8da05494-fe7a-4af5-b257-bada143ee426',
    '8b098ab4-df02-4619-8ded-657e496695c1',
    'd48adc59-8934-41bb-9720-63e71f1933be',
    '8e88cabc-e713-44ed-a5d2-41935c3b4eb5',
    'be8cfc23-cd19-46fb-92e1-a77ac380d7aa',
    'f9224ea2-dd31-421d-80d4-f35082ef8d68',
    'e2a398ff-18c3-4258-9d75-89adb2923e88',
    'cb8ebcd0-bb5e-4a6d-bfef-5c651a1a9f6e',
    '6238f8f7-5efb-4023-8d85-ef7db9b4dad7',
    '32c09444-3d4a-44d5-af6b-07eef92189db',
    'd7686f98-05a4-45c9-af2e-3ebc524a5b2d',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '8ccefc59-38a5-476f-b7d3-3f98315a97f0',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '6e67e1e1-5c39-43da-960f-48385789c4e1',
    '92382848-f5e9-426c-b7dc-f2841ae97018',
    '4999a410-990e-484b-b4f3-d636f894a741',
    '1f534eb4-701f-4182-9895-64c5e5b52d82',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    'feca4815-b44b-4b2b-8d77-75edd62ba5a6',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '7427b2eb-a84f-413c-bfb0-7d2e36b0628f',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '28c3df75-0b08-4d5c-9feb-6e2e918572ea',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    'f69c21e9-cb5f-4e72-acfe-c54b672a9f3b',
    '2ef4530a-cc36-4f32-9a1a-63a555346587',
    'e917c83d-c482-442d-81ce-869de7d20903',
    '10774229-1487-4188-b2c3-1fabcf85492a',
    '65793118-3c88-4185-9172-2354850e6056',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    '46536136-08e4-4521-8e6c-67f023de020d',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    'b8e7fe18-9c3d-4cc0-bbc7-85b27197fc8f',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    'b60b4737-c646-4299-85a0-520890e830b7',
    '280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
    'd48db47e-acba-4377-b031-f6dfc21f3658',
    '3fd2204c-8654-4af7-832f-c186447262e0',
    'ae34e63e-13af-48b8-8b72-8137289091b3',
    'd3ed2595-b8be-40c8-b7b6-10a4997b9d2e',
    '575dc7da-58ed-407d-9e88-7b586f28bf65',
    '20ddfcd5-d456-431b-9f05-781e05d873d6',
    '15d41c35-943c-474b-afa6-e1c6d6e4be2b',
    '61803dc8-f649-43e5-ab15-d351f2cef629',
    'abe58d43-e1c7-4953-aa41-4d3b6f6cca44',
    '3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
    'fa278604-7d85-4491-a30d-15c7821f8b00',
    'dd6c6688-b73a-464c-86d9-3369fdf98268',
    'b5c0bf91-9d20-41a2-9dd2-87d0ef0310f9',
    '97c636f9-0983-481f-8ff9-7b5b3ee6b10e',
    '15b153f5-ed02-4216-8f96-99743b8b4fc3',
    '747858c0-d139-4f52-9f0e-a618b880d6d6',
    '32bc49c6-7583-4613-a72f-5edb12b2a808',
    '3eb8ea77-4605-4bb7-90f9-671953abe4a2',
    '2b08cb76-061d-44c6-a00f-b43a5421df5e',
    'ab7e390a-adc5-4f9e-b317-a216a2904c93',
    'd1e6d0e4-d49e-4a16-93c6-7956b2c03414',
    '713f8676-8034-4827-bccc-cd6d95b1a4c4',
    'b00883d8-9251-435d-aefc-8a703d96d2fb',
    'eefbea02-0d65-441e-b455-35aa21d25ba3',
    'ff7e3be2-c0ac-4d97-85da-6229bf7585ac',
    '7ac92a42-e112-49c3-a8f5-8ad2c7ef5578',
    '0701aae2-8661-4eec-84e0-7c8be1c89a18',
    '7efb1905-34b4-4f1c-a8a6-8e64b3640a68',
    '11a2b088-8c1c-47d2-9c1e-455d457d2f05',
    '74608bd9-39e4-4f48-9b7c-1cd9d3c599c9',
    '7baf8e8c-de11-452d-b2e1-aad7c08cc18e',
    '23b0219d-0820-4017-b942-bda8578e90e2',
    'd7bcfc5d-e258-4bd6-a413-bb7a118e6bff',
    'a5f631ea-2b4b-43f2-9ea0-e31f2b11fa27',
    '8523489a-f57c-4993-81e4-1ed86a5c092d',
    '395da421-e6e8-4a26-ac93-eb7050a7cb1f',
    '9cb5ce25-38e8-4628-9ddf-d6aedf5efe0f',
    '615f6246-1c39-4e44-a9d4-c7133a2ae62d',
    'a647528d-925e-4c02-8825-ff54720c6ee4',
    '2c6f63b2-439e-499f-b687-b3fdd88a492e',
    '68a916af-2e0c-41bd-8535-c7eacbc2d1b7',
    '0e7f31a0-c712-4ebf-ab3a-64c37f43e52a',
    'c8b1d323-f352-482e-bf17-82075c23dcee',
    'd30f51c7-d642-4e7d-a168-967b9520a80a',
    '8d89608c-0d61-4d71-a2e3-9fbc6cda69bf',
    '71219f56-551f-4ad4-9a38-cc4aaf8a1e9a',
    '48dd6010-77dc-465b-a27c-695e29b57a5e',
    '3376a8b6-7ef6-4191-97ab-a547da0d330d',
    '21384132-1697-4e9b-b863-a6492d13285d',
    '582f5f8d-b96f-490e-b417-ba824baeb06c',
    '7e825ee6-7c03-43cc-b0a4-0d9203a30bd9',
    '2843292e-e494-4642-90e0-57e5c153f12c',
    '4ecbb7c8-0246-47f8-9654-4caca1d52565',
    '7e3ea1bd-95ba-4cad-90c8-3eec95be9cc8',
    'bbba696b-d023-4bb1-a213-c8bee31e8bae',
    '00bd45f9-beb2-4fb0-8680-bd30e392975a',
    'f85e467a-958f-4da5-a01b-8df883e69122',
    '470eee0f-2053-4d9b-9f5e-ca9661a6cc16',
    '5c1dc76d-b703-445c-9b38-cc2d00b9ab16',
    '0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
    '608d793e-a78b-4872-a50c-21a9eaa60ec3',
    '74ede771-6781-4980-bfb9-5d853b7cdd6f',
    '6c47e282-5d5e-445c-b6bd-c0024946fbe0',
    '6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
    'ff8ffbcf-c932-48c7-8d5e-d995d5680e21',
    '9d74b4f0-b2d4-46aa-867a-52fb6102bfdf',
    'e34f15f7-c225-4314-a638-90504bb0aa0d',
    'c1d222ab-bc0e-4e13-8379-0ee5be9e140e',
    '582187a5-ad63-4759-9162-55fa6337eb07',
    'd1e8d19a-970d-4ede-b5bc-9cab7237adec',
    'af867604-d801-41cc-9949-017eb30a0cbf',
    'c9dd3578-01db-4687-9807-4f71368941d1',
    '722e332c-fb1a-45fe-80c7-cc670f025b7f',
    '9f152896-ebf1-4756-b678-bdf739a92256',
    '478aa270-fbd4-4a45-8f63-221b4066168e',
    'e9c7ad29-2213-4648-9164-33a07bd42cdb',
    '1d140c76-a06b-42a0-bae8-b9e169ebe394',
    '5edcc3db-c676-412a-9506-600959bb81f2',
    '9f7dbe05-96b5-4b2f-9f3a-34b552e3dd21',
    'ccc524ab-d9ad-467c-a25b-9a14fb05e976',
    '9e3fb02d-dcf6-486f-a42d-89446a852057',
    '4b341ba9-49a5-43a2-9b7e-cc96beb59946',
    '15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
    '3a3100bb-369e-47c1-a77c-2cacb7cf020d',
    '2c11b505-17c8-402e-8422-0239accb449d',
    'a6d7e030-e6c8-4c62-8cb5-165ef54987c4',
    'e25a8172-1e34-442c-a45d-583027a2d734',
    '0c18589c-6432-4a6c-90ce-985a47a66f39',
    'c911503c-f010-4c17-ac57-1d82e954bdc7',
    '487016d8-ea02-4b20-a45f-7382139aa865',
    '677f0bdf-6c5c-462b-8294-3666f777bbc5',
    '34da5c11-bbe8-4e55-8d89-9ef8a1c66200',
    '9a4d9d5f-72aa-4d7d-90f1-6d1181ee984c',
    'e6b15b39-daba-431f-a918-e4e43e702c30',
    'bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8',
    '128dce74-fa37-4f2f-8a80-d542edd81a11',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'f461fca1-80b2-4980-83a8-e165d49acc18',
    '37f0f1f9-83fb-49a1-9941-093c068c32d0',
    'cfb3dad7-c6d9-47c0-81b0-2133d75f5c0d',
    'c5c0893f-b254-4038-8d08-b28ef5a26b5d',
    'bbcf8529-1a04-43fc-b6cf-cb161028159d',
    '06421648-dfcb-4460-b93b-c7d6804dddbb',
    'e0b28b59-1cb5-44f4-ab8f-badf5c74f69f',
    '631deea0-2821-4d14-ad02-dc0ce4864924',
    '95788aa7-c897-4ae8-9166-4b8fc1fc5342',
    'eede320a-ed63-41d8-960d-5405a26a194f',
    '36dccf81-6932-43ae-9864-53379832d878',
    '9102024d-58c0-4bb9-aa55-12c00d98b6cd',
    '01eaf423-8cab-491a-b82e-6915dbc73594',
    '0481a135-9db1-424f-9065-a83ebd7ec995',
    'b60876c5-d825-4303-befb-ffff55b92aba',
    '49022563-1be1-4e42-a11c-01743cd5c94d',
    '64f2dbe1-6f58-493d-ab6b-c93568d828f4',
    'a9626803-72c2-4e23-968c-a090e3f22c5e',
    '095728d6-4ea1-4909-8a74-a8f3fa7f86cb',
    '8309cd89-a912-462a-90ad-f13ae0d7aa6c',
    '902596ce-714e-49b3-8271-f3dfece52309',
    'f5f29e4b-68f7-443c-b290-0827d4167fd5',
    '07c3a7f4-1e59-4dcb-a244-2fd3d084e2b0',
    '6f49717f-8f57-42d0-8548-316ecc292415',
    '7e693091-8ae4-4c40-8e66-c3b39f01b90e',
    '544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
    '5137255f-0c58-4ac7-9266-bda8ab0247c2',
    '5243df74-712d-49a8-989b-528d15088e8f',
    '278a26cb-a710-4fff-928e-fc2e7084a75a',
    '58a1d168-8290-4c69-bf01-17ba3a084365',
    '9fc492f3-8d13-47ae-93e9-812c0224f1aa',
    '822d381e-cea0-45bb-8fa0-1b7194b4b64b',
    '2dc01a50-ea7b-4d9b-be57-1ffbdd98b27b',
    '5b6676dc-f46e-43a8-b87c-e431e369e53c',
    'eb35085f-0cbf-4829-a3ad-acaa53a250b5',
    '36fa2d20-622d-4cca-80b0-683672c94170',
    '44161b51-953d-4f6b-9448-5cba4a44a9d8',
    '0b90b2ea-8ca3-406a-9f69-95eddf7699ef',
    'bfb202bc-4078-4df0-82b9-9218dbc1f1a1',
    '8cc59f51-b0df-4a5d-a3c5-83ee526ff1af',
    'fda7c4b9-9f35-482b-9eff-be7f11058d94',
    '85646f4a-e424-4363-8033-1e7522e8f175',
    '8945794e-174a-49f9-a2d4-4242f9bf3833',
    '529343b4-698a-4b36-ac55-db8a6965ad3f',
    '0f949ee9-0986-42b2-af5d-0f4c8338c664',
    'cb1f06fa-b916-477d-8ab6-fb4b3f24efd3',
    '05253b3c-e8a3-4db4-8a6d-014eac7b3d94',
    '4807db90-b0f7-441d-b489-932f9b341f74',
    'c33b1f32-6021-4d1c-a4d5-fc3d501107f4',
    'aa314675-af62-41df-b5cb-3b22558e903b',
    '20741062-7d1d-44b7-bc33-39c9ad26e414',
    '69ce1be3-1815-43a4-bdd2-4696d9c8d09a',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '18716daf-4223-44a9-bba9-fc9baeef7d07',
    '475430c5-28cb-456d-9c5c-bdbfab9fafb2',
    'f0061cb3-688e-4ad4-aeb8-8614282292ec',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '3a89c170-2939-4c12-9940-f32d96fa9e55',
    '9a06c401-da3f-41b4-b38b-238796fcae09',
    'b32d88c8-31e3-4789-a75f-e52bf1272937',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    '5e0e8f9a-ce97-4b18-9540-3015c61e393c',
    '1c8ba244-1c7f-433a-825b-d2d34d018dcf',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '373ff2e8-0f63-4179-a55c-3fe0b85556aa',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '7577f264-8e84-440d-9346-7c4d5affda51',
    'febd8561-4769-4f3b-b7c0-ae7ff6ede2e9',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    '2355554e-8951-4b41-bcd8-32e18cddb7c9',
    'de1e7762-673f-4d44-8f45-7e693bb338b7',
    '239a484f-67c2-4ba3-a3d0-d6e4c2b27475',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    'c6f3bd64-ea67-488f-904f-f0bdf6320b5c',
    '166746e8-ce26-4fa1-a587-443ca9fc59a1',
    '80baf71d-28d0-4bca-81b7-49ddfadfa7a3',
    '6d18aafc-0240-499c-902e-a72a5b98ff0a',
    '263ab7c9-bd69-45dd-abb7-bbf35b9786ed',
    'e0c7877e-75d7-47d8-b5e9-5dd677d03353',
    '49a97523-0a7a-4d5a-ae20-496f86de2032',
    '583023a1-aa12-40e2-a964-8ad50ad400ba',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '17d3ffb4-e891-4ac6-a91a-fd52971c1115',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '488a38ee-f996-482d-a562-a4474f5594de',
    '680d748c-7c60-46e2-aea5-7fc557a916ea',
    '462d992a-7c13-45ac-a6da-1254fc3a9031',
    '4bc5b4eb-da91-48f7-bca0-134ed1a484a0',
    '6fd0f009-3c34-4529-9a38-c59745545490',
    '74d1e549-5ae8-4410-9428-f8f2cc85fa80',
    '868f72af-99e8-406e-9f7e-14577e6c7157',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    '595b6755-e7ae-4e83-af2e-693c089aeec3',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    '279e5670-8a47-4992-bb10-14e6c719db97',
    'd306000b-88c1-4220-8d7e-933c0118a983',
    '703c4bc2-81bf-435a-87fa-21dc9278bad6',
    'ab76b5ca-e464-4063-b949-853f61036370',
    '672b617f-936e-440a-a735-80f94798aed1',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
    '033fc1e1-0337-4656-bbe1-3f06fef641e9',
    'bda2bec8-a142-47ab-bfb4-83759ac2bddd',
    'e858d4f9-3385-4640-b0cb-4894e86d501c',
    '39fe0c8b-bd78-4565-9415-63eabc1d6d85',
    'da4e904f-0346-4cd3-a5c2-ba932511d98d',
    'c46c2220-da88-4f60-a0cf-eebfd0a8ff12',
    '629e31cb-dd7b-4345-abf2-fa23c6c65a09',
    'e9a57082-5a93-481a-bbd0-1acb03ac751a',
    '0faf149d-b316-4fbd-8605-a59354f0eacd',
    '1d23d3cc-5db6-4734-bfaa-507dd366d99b',
    'dd58f556-0049-49c3-9a51-d6470a2abddc',
    '53185d06-f2cc-4942-88c4-8534b559a9ff',
    '0132f320-830d-40d0-a4da-06a5d5f9e8d9',
    'bb7d6408-941a-4da6-8613-36498bc6d91b',
    'b8d11ca1-3db8-4efa-bf57-0305e004a26d',
    '9bd56ad6-080e-4d26-acca-83e4df8aa913',
    '00c11c7e-8530-4bfc-abd7-8c10f4c602d3',
    '8f6b9e20-9468-4f46-aa45-eeab9de88e53',
    '272dff18-acf3-4874-a55f-ba8fb6f80352',
    '70ac3659-06bb-4022-be55-af81d3e35b6f',
    '409b92cf-5c4d-4997-9736-ef2ea10d19e9',
    'a8636719-e26c-49b6-9a53-7d77f3d3c94b',
    'a3e81d5f-8dd6-43dd-9172-d80d212efa2d',
    'd40af129-c13f-45b2-92f0-d0e8fa5cc1c9',
    'ecd2d2f9-2b6f-4743-8d04-c9bb554a96cb',
    '9ee78822-7acd-4fab-9999-c58e9fe266ad',
]

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# One BigQuery client is reused for every per-dataset query below
client = bigquery.Client()

# Loop through enumerated datasets and tally file counts per file format.
#   file_format_count_dict:  file_format -> total anvil_file row count across datasets
#   file_format_source_dict: file_format -> names of the datasets that contain that format
print(f"Start time: {datetime.datetime.now()}")
file_format_count_dict = {}
file_format_source_dict = {}
datasets_list = datasets_api.enumerate_datasets(limit=2000)

# An empty dataset_id_list means "process every enumerated dataset"; resolve the
# selection up front so the progress log reports the real total.
dataset_id_set = set(dataset_id_list)
selected_datasets = [
    entry for entry in datasets_list.items
    if not dataset_id_set or entry.id in dataset_id_set
]
dataset_list_len = len(selected_datasets)
records_list = []
dataset_count = 0
for dataset_entry in selected_datasets:
    dataset_count += 1
    logging.info(f"Processing dataset {dataset_count} of {dataset_list_len}")

    # Only datasets on the expected billing profile are queried
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details to locate its BigQuery project and schema
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    file_ext_query = f"""SELECT file_format, COUNT(*) AS file_count FROM `{bq_project}.{bq_schema}.anvil_file` GROUP BY file_format"""
    try:
        df_output = client.query(file_ext_query).result().to_dataframe()
        for file_format, file_count in zip(df_output["file_format"], df_output["file_count"]):
            file_format_count_dict[file_format] = file_format_count_dict.get(file_format, 0) + int(file_count)
            file_format_source_dict.setdefault(file_format, []).append(dataset_entry.name)
    except Exception as e:
        # Best-effort: a dataset that cannot be queried is skipped, but the reason
        # is logged instead of being silently discarded.
        logging.warning(f"Skipping dataset {dataset_entry.name} ({dataset_entry.id}): {e}")

# Build output records, most common file formats first
records = [[file_format, file_count] for file_format, file_count in file_format_count_dict.items()]
df = pd.DataFrame(records, columns=["File Format", "File Count"])
df_sorted = df.sort_values(["File Count", "File Format"], ascending=[False, True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

In [None]:
# Show the names of the datasets that contributed files recorded with the
# '.parquet' file_format (None if that format was never encountered).
file_format_source_dict.get('.parquet')

## Validate file extensions

In [None]:
def validate_file_extensions(dataset_id):
    """Check that file names in a dataset's `anvil_file` table contain their recorded file_format.

    Looks up the dataset's BigQuery location through the TDR datasets API, then
    counts the `anvil_file` rows (with a non-null `file_format`) whose
    `file_name` does not contain the `file_format` value.

    Args:
        dataset_id: ID of the TDR dataset, passed to `retrieve_dataset`.

    Returns:
        str: "Success" when no mismatching rows are found, otherwise one of the
        "Failure - ..." messages describing what went wrong. Exceptions are not
        raised to the caller; their details are logged.
    """
    # Retrieve dataset information
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.warning(f"Could not retrieve dataset info for {dataset_id}: {e}")
        return "Failure - Issue Retrieving Dataset Info"

    # Check files with improper file extensions
    client = bigquery.Client()
    file_query = """
        SELECT COUNT(*) AS file_count, SUM(CASE WHEN file_name LIKE '%'||file_format||'%' THEN 1 ELSE 0 END) AS match_file_count
        FROM `{project}.{schema}.anvil_file`
        WHERE file_format IS NOT NULL
        """.format(project=bq_project, schema = bq_schema)
    try:
        df = client.query(file_query).result().to_dataframe()
        if not df.empty:
            raw_file_count = df["file_count"].iloc[0]
            raw_match_count = df["match_file_count"].iloc[0]
            # SUM() over zero rows is NULL, which arrives as a missing value; treat
            # it as 0 so a table with no qualifying rows is not reported as a failure.
            file_count = 0 if pd.isna(raw_file_count) else int(raw_file_count)
            match_file_count = 0 if pd.isna(raw_match_count) else int(raw_match_count)
            mismatch_file_count = file_count - match_file_count
            if mismatch_file_count > 0:
                return f"Failure - {mismatch_file_count} files have extensions that don't match the file name"
    except Exception as e:
        logging.warning(f"Could not validate file extensions for {dataset_id}: {e}")
        return "Failure - Issue Validating File Extensions"
    return "Success"

# Loop through datasets and validate file extensions
dataset_id_list = [
    '8fbfea50-6a71-4b19-98e9-f95e3a8594c7',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '179eb85e-2557-4677-9cba-d763310f3df9',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'd239dd7b-8d10-4960-aa91-8f8ede641e25',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
]
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Validating dataset_id: {dataset_id}")
    status = validate_file_extensions(dataset_id)
    results.append([dataset_id, status])

# Build the results frame once, after the loop, so it is always defined (and
# never stale from a previous run) even when dataset_id_list is empty.
results_df = pd.DataFrame(results, columns=["dataset_id", "validation_status"])
display(results_df)

# Pull Search Facet Values Across Datasets

In [28]:
# Input parameters
# Each field is written as "<table>.<column>"; the processing loop below splits
# on "." to obtain the BigQuery table name and the column name.

# Scalar columns: values are grouped and counted directly.
nonarray_field_list = [
    "anvil_biosample.biosample_type",
    "anvil_biosample.anatomical_site",
    "anvil_donor.organism_type",
    "anvil_donor.phenotypic_sex",
    "anvil_file.file_format",
]
# Array columns: the processing loop below expands these with UNNEST before
# grouping, so each array element is counted as its own value.
array_field_list = [
    "anvil_dataset.consent_group",
    "anvil_dataset.data_modality", 
    "anvil_dataset.registered_identifier", 
    "anvil_diagnosis.disease", 
    "anvil_donor.reported_ethnicity",
]
# NOTE(review): dataset_id_list below repeats some IDs (for example
# 'cefc1a79-446c-40d2-b140-ba8d8b1c0712' appears twice near the top), so those
# datasets appear to be processed more than once - confirm whether that is intended.
dataset_id_list = [
    'cefc1a79-446c-40d2-b140-ba8d8b1c0712',
    '4e699ead-bbb5-460d-9b32-2b1b322c601b',
    'cefc1a79-446c-40d2-b140-ba8d8b1c0712',
    '2355554e-8951-4b41-bcd8-32e18cddb7c9',
    'bafbf771-1cd2-44fc-9b38-5a4bbead8ab2',
    'a36eeaf7-d6dd-4887-bdbd-e435a07ba156',
    '8de6dae2-55ff-4287-9b75-5b2a950c1f44',
    'ce6692aa-0f97-48fa-8628-b8fa3eab4726',
    '31433635-91d4-431d-8d26-bc54e84c8e8c',
    '0b06619d-39d9-4437-8c42-2e415faa634c',
    '12ffb586-5f6a-4f0a-a353-d2f34599f4cc',
    'e642bca0-52fb-4ab3-ab3a-acaab83deda7',
    'b7fb531e-25a4-427c-9679-b7bdc3d03535',
    '3615e063-f24b-47f7-87cb-430e8aca8d0c',
    'a3ea4f97-6657-4d3c-9be6-96f097f5c952',
    '9f4ac69c-0919-4ac1-98a8-976ed79ace03',
    '96461004-f4b3-4f82-a842-293b3ec46a60',
    '841b7883-9447-4ea0-ae4a-84ea0240d919',
    'd0fc3d6a-c3f4-4533-8a23-817a4e27f9be',
    '2cace5dc-f660-45d4-b689-c4c89e77697c',
    'd2272f2d-c606-4027-b8ea-0bdd6d9d6535',
    'b12fb9be-2ce0-4bfd-8503-732fabba06ab',
    '9f9fc99a-b867-49a9-a3dc-8a39efbd5fa2',
    'ce58654d-b7d3-466b-99ba-b203d527a543',
    '179eb85e-2557-4677-9cba-d763310f3df9',
    'cba804c9-0bdd-4219-a53e-98c8db6334a0',
    'd239dd7b-8d10-4960-aa91-8f8ede641e25',
    '5c6a1c4f-ccd3-48a8-ac00-e18e5ecaa0bb',
    '19e2c8ab-853a-4204-86c3-f591125fbf63',
    '7cf0d3d0-f79b-4bfe-bfc8-e4e6c33dd4c3',
    '3a72e4b8-afb4-4299-98ec-a9ba9606be06',
    'f3c89298-0dd2-40da-8627-3baea553b34a',
    '9a32e23e-840d-4ba3-8cd9-392f48b8e9d2',
    'c5d967fd-09ce-4b02-97dd-ac3abf6f79fa',
    '173e56f7-b813-4c41-89ff-09a824e1407f',
    '80312f74-bd56-4938-96ba-e9bed95d1f3b',
    '017445d7-d56e-4e2e-b480-b4879b51e944',
    '13b2076a-cfe1-49ec-ac61-bad1af9a52ea',
    '175dd803-02c7-4823-81d5-9e0621652ace',
    '15492baa-05ed-47bc-b50c-e587679ae51a',
    '33705ce9-b2b3-4edc-9b47-f54283e193cf',
    '92486440-3a46-44dd-b853-b300ef75b31e',
    'dd2cb8fc-42a6-482f-898e-ef6125feccb8',
    '4e99b8e1-40b9-4fb2-90a0-d85e926ef31e',
    '128332b6-5060-4ec4-b6a6-f53b54a810be',
    '06f05f58-3c83-4f5c-bddd-bed7d2d1d147',
    '51e9935f-ec18-4832-801a-6d9186537572',
    'd6291444-8c3c-470c-b28c-7cf1d5c7aad8',
    '41cb9f29-4ba6-4690-821c-cb085e6b0f2f',
    'e68d1d39-99df-4cd7-8053-1b298f03eabb',
    'b252e3ac-4a8c-48e0-9999-5ee0c9a5842d',
    '7ea006d9-1e19-4678-b2e6-d4a1ea327f74',
    '34fd3b22-ac73-47d2-8849-5877158ec072',
    'a08dc7a6-f8ce-4205-95d2-83f614c2c32f',
    '577f36fe-8154-4c82-ac87-b2a64cb68f35',
    '7ce3270e-b2f2-47f4-a288-639751b2f87f',
    '36bdd59f-4f5b-43cd-8d34-a21ef87bbf30',
    '41d12dc1-8718-4439-b409-26cc23573107',
    'c4c49fcd-0c20-4cff-841a-cb58f5689c5b',
    '6b40557c-ddc3-4e7e-8a45-1761e7fcb8b5',
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
    '9ee2a552-89f8-4a48-9c94-9fa26ebb7483',
    '425412ba-894a-4824-acb8-bf18fe4576e0',
    'f22bd762-5c45-453e-bf22-b174514abb84',
    '0ee62643-b064-42f8-9b09-5d10eacd70a3',
    '1a7f6728-5116-4f24-897a-59a7f322cfd2',
    'c37b388c-7107-43d6-bee6-4e82b40ed271',
    'bf6f1d78-6a0d-4afb-aea6-17a3c34340db',
    'a3becdde-018b-46f0-adea-d587076eef4a',
    'a9ad3a05-24fb-4e59-85b0-ee09e55a4492',
    '719f7581-21db-4aec-8c46-4a5811832710',
    '3725b660-1106-4173-9c4b-0a15926becf5',
    '318a75f4-ac50-4944-81b0-70a1323e7497',
    '75fb0984-2124-444f-881b-30a1a6f8b8f7',
    '15be288e-53e1-41cb-8d20-8ea87efb9258',
    '700303c2-fcef-48a5-9900-096bf34e2d83',
    '38fd20ce-affd-4791-9810-7f5a7fe876d0',
    '8b8185d3-ba5c-4832-af23-3ff8ca6ed016',
    '140797da-dc94-4fc2-8b0b-f2e1dec7bd43',
    'ec97fa0f-e174-40fe-a6b8-ee240bdf4318',
    '5488d7c1-5195-4ebc-b0f0-31033fa06dc9',
    '56f9888f-e623-4a1a-b2b4-46378a6cd6fe',
    'b2e7f15b-65d5-4812-abfd-b2dbc6d18850',
    '69f8d7c2-2e14-48e6-b838-7881016313fb',
    '809fa952-3178-46b7-bb82-8a476ef32e67',
    'dfb14a1f-38b2-4668-b98f-59b5b5b53ca3',
    'f553b765-1c9c-464c-a8fa-07700a1691c5',
    'b108dfd0-711d-4bc1-aab5-1b312226c8ad',
    '38eca26c-d79e-4447-99d3-1889d20ade21',
    'd5a0e24d-689a-4854-92c7-9a39f980b523',
    'ecd0606f-4fa7-4e57-b6e8-eea377e65d5b',
    '0c6bc810-5ae0-4926-a56b-2bc2fe7dbe6d',
    '9dc31133-c882-4f39-903a-a25f316bb560',
    '23a0ede0-4f97-46af-9f04-bd2805050980',
    '7593c1c2-3680-4bf5-8a65-dce5f96a3b59',
    '02661394-2886-4ef7-aff1-d53225c82025',
    'c1644d4e-06e2-4fa8-95f1-5c1da5831257',
    '267cf516-dd33-4640-a71a-78bd8f5db9d8',
    'df06ff22-6a2d-4934-aac9-c8368efbea1a',
    'dcdefb14-f6de-4c46-ac7e-842b273416bf',
    '28208cc6-50bf-4864-9a48-981632066640',
    'ccfe264d-a35e-44f8-9b2b-241a0f8327cc',
    'f177843d-47fb-46ae-83be-73c92ee85081',
    'd049d487-1a69-4358-8dad-0e6fa6c06fdd',
    'da29226b-e856-4014-8c8b-c4268d0df2cc',
    '714dcdbd-8d17-40b0-8246-0e941af8175d',
    '53ce7d12-facc-4412-a710-f535efb209a2',
    'b9842819-5fd6-40c5-9668-aae1ea44a308',
    '7c056125-3ed8-459c-b73e-edfa3f80cc27',
    'ed1215f1-787c-40f5-9d77-4b5bc2dfbb84',
    '7feeb2b1-1926-4968-b6dc-e0a1e4cf8d4a',
    'dbf5d87b-4cab-44cd-a792-1d0218aad973',
    '85dbde76-c130-40b2-8a8a-ba815ba499da',
    '1c2fe11d-b020-4c54-8c71-1ea91623d626',
    '84133066-68cd-41fa-819b-d74a3ac85862',
    '54c6fa73-9b84-4a3b-9e97-e4e43165c48b',
    '90ba1853-f845-4502-ba36-b75b9e571bc5',
    'f3c88c3c-8e1b-4af9-9467-0621404e314c',
    '332bb145-6ef1-40ef-932c-aec5bb6210d9',
    '44f83f20-d618-40b5-b2cb-3676b8fe3ad7',
    '3c2c39a9-4cc2-4f7c-89e0-054a871e2c4e',
    '2ebb722f-a3df-4ea4-b72a-813e3db0bab5',
    '1817528a-4f88-4ed6-8965-9eae0220ab27',
    'e03eb011-05f9-4491-b779-0cc2aefabff1',
    'd4bb7169-5a7d-4090-ba62-12ea799c3ade',
    '3f172982-060d-4339-a09b-6994c2c9eb16',
    '51789659-5233-4ee7-8bca-dedebfc87773',
    'a5fe75bb-d28c-42fb-aaf8-92fa37b266d2',
    '6545d602-e5b4-4dd1-8f6a-64e0a1952ddc',
    'e5c79b74-20d5-4b6f-8085-0bc788eed2ea',
    'f492567d-6db8-45c8-b44e-6b5def26c812',
    '12e54f96-00d9-4a38-921e-e0d42610b2c1',
    '845b131e-7c05-4397-ad40-23dea8e9b399',
    'bd492b71-b20e-4056-b8ae-ad8c94cfbc02',
    '9ecc231f-e3d3-4417-a98a-c4db4c638161',
    '3fb2d04a-d18b-4bdc-9372-99b992f2ae42',
    'e922a496-e686-4fa1-911d-2159ceb0f09f',
    '8fbfea50-6a71-4b19-98e9-f95e3a8594c7',
    'd911e57a-ebb8-4be8-876b-d8e5790ddce3',
    '6c9423a2-3ea7-4c3c-9b12-0cc993bc095f',
    '52e015b5-22b7-4a96-9f0a-ea3afccbfcbc',
    '325f3ee8-2adb-4092-bd78-1b5ea5b0d1d6',
    'ae50ef98-ef3d-4427-b094-83b2d90787a0',
    'e6771964-50e9-482f-9d23-18c22cd89ab8',
    '3fbacc64-4c53-4770-8cdf-a616c10ec5c7',
    'a3ae33bb-8b3a-47e5-a2d1-a49c954776b3',
    '0b0a52bb-a1a2-4638-9259-4447761c2da4',
    '0eb42259-7b44-450f-a9d7-500b2ea7179c',
    'e16adabb-88e0-4739-983a-98ac5c181842',
    '71f94dff-fbe8-4881-af1f-4987b67d5181',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '0e65b131-fd14-4fce-908b-c5b89a71a9c1',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
    'd00353de-f6f9-42d9-8a8f-f88b3d880dbf',
    '3be57453-9325-4c2e-b73a-832139b61778',
    'fcc60ac9-0d20-4a7c-97e4-e3c8d3aa8f76',
    'dc5f85d8-333b-4b68-b160-ad9856233887',
    '655e6a61-5400-4d8a-95bc-1506e026b289',
    '1f2d14d4-1bd8-46fc-9d35-1a415e5f326a',
    '64fd39fc-b32e-4b0a-8f83-4bf11b197462',
    'a77a2c65-38fe-4bf7-9ea6-0a2dc65eb21f',
    '25248cd8-2e98-4a83-9ccf-af7214fa71d6',
    '158ebecd-4596-4541-b832-a137232b7036',
    '1ccb95c3-1901-428e-b7bb-34495f41f4d2',
    '02ff1051-cd1d-4bbb-a005-21384cbff846',
    '0144b0d3-a809-46df-8c67-7ce42bdd579a',
    '35a1009d-93a2-49b1-a801-fe84d6b7a2f5',
    '50132478-c9fb-4dc5-86cd-d5dfab909393',
    '35064fc1-6c52-4005-8e99-cb0d6afd3f8c',
    '5cf859f6-990c-4b04-8609-35d5c57920f0',
    '62cfdce6-2d4d-415c-a11e-5ab60131c668',
    '2d07dd45-a263-440d-a339-9ccbab93aba8',
    'f1513955-0264-4733-bd25-3f752c61a323',
    '93e712f2-3e54-466e-aa53-57eb69c43bc0',
    '296f653a-91a8-4139-9bab-e6ae13afe99c',
    '633dc1aa-084d-43bd-9b17-bc6e57f81d48',
    '9320b3b5-3944-4bd2-913a-23b72bccd86c',
    '86ab4d3b-86ce-422b-ae6f-1ec6968a874d',
    '4124010f-7308-4831-80d7-ea14343249ab',
    '3037caeb-fa7a-4924-b399-7e4c7173b3b8',
    '146b72bc-1dcc-4e3b-bcda-d3dd25418012',
    '1d575e14-c3b1-4ead-a63b-a21c08c6a14d',
    'c5b1e333-7203-41ce-b8f7-3ef3a3bd721f',
    'bf519ea2-afe1-486a-9954-7362f10b6b60',
    '254ffffc-2bd6-4b2e-905b-a8c54c348cd0',
    'bb65d291-a673-4e4d-8a37-ab1f7401a902',
    '3a9604d7-456a-453d-a46b-40408624a07e',
    '2cbe079d-e7ab-47d8-836e-454a71440297',
    '84fad495-2756-472f-ad20-f91de6f67baf',
    '28e73469-12d4-493b-bf6f-83359c1f69c5',
    'db266afc-2f75-4b03-a3b8-c69e0ce6f713',
    'dd6866e4-8949-45bd-8910-8ce64f79e3c7',
    '12bbfa4c-c30a-4cf6-b79f-45354f842964',
    '84ac0d05-4be5-43e9-973e-ef999144d802',
    'ff8b1212-858a-4048-8f63-9464c922591a',
    'c814d754-cdc5-4b0d-8671-a39e85b2c473',
    '797b2563-5d56-4f5c-bdaf-3bfd11e8f5b3',
    '85287d84-fefe-40df-ad40-5b135ee0c07f',
    '7eeede5a-c86f-4577-9f3c-65ab618a6dee',
    'a52c04ee-cfef-46bb-9b40-6a9b292e1a7b',
    '0194eea9-d779-4957-8521-11717a378e66',
    '0d82658c-44b3-4cea-a388-3353a96a31ef',
    '77dca0d5-4d22-4415-8858-075590d25cb5',
    '1b05159b-6277-4345-9d59-f7bba5ea1d56',
    '92299ff4-c0d0-4e94-b374-75d0038cbd68',
    '60f96582-79ad-4461-9f9a-53c1bc3d17b6',
    'cc107de7-d623-464a-a875-c8b7ae5fb09d',
    'b5d7c34a-c383-4fc7-aa4d-b6dc941cd41a',
    '85baa8f8-619c-4165-9d3e-53220f645814',
    '3f278de3-f201-4344-9639-d35cd7a62adb',
    'c423b18d-12f2-43e4-97f9-993e2943270e',
    '416b8daa-9537-46db-ae7b-3f5ff5f01dc3',
    '61940344-e6c1-484e-ba10-131f43a9b13a',
    'ceee2791-0fdf-45fc-a4e8-8077916771aa',
    '5205f817-7de0-48b4-89fc-6398cf13bff7',
    '1d2f5472-ab6a-4a9b-ba53-520858cf79db',
    'f757278a-3c74-4690-bf89-5149d21ff3af',
    '5a103ab3-29c3-4d07-a0f6-4999c256cf26',
    '2a263db0-8c33-4171-840f-54bf4755a4b9',
    '9828f3fe-f676-4bf1-b600-5effa24ea9c8',
    '28849dc9-a97f-469b-b2ac-a8ff97693f02',
    '29cd0578-fb47-495a-8f48-b37325eed81a',
    'ba503d2e-48af-48bb-910a-be41790d921c',
    '472f01ad-7bc3-4fe5-9771-2695930dbc95',
    '956cd931-7077-4a08-9c75-ab8b4e5d1eb8',
    '31a42df0-29f6-4d4e-ae5c-2e13abc355f2',
    '13364604-ed08-4a61-89cd-65eb372ac8c3',
    'b724164c-712c-4615-97b7-529a108a753a',
    '22199347-9454-41e5-8912-eb38edd33a25',
    '60cadee3-9e63-4897-ac81-4fb283033648',
    'dcd4112f-09d3-43ed-8441-df9bf4c9ddc8',
    'da02c3f1-371b-4afe-9b5e-b8c584fd5907',
    'f6565f2f-4478-45ad-8c11-04dd242fc6a9',
    '275ea204-4612-4d3c-ac0d-f110f61d62ad',
    '72f73fc5-6a3a-43a0-8cce-09f4726b736c',
    'be72f1e3-b5f5-43f4-80db-6d7de93a654e',
    'b6bf4699-6f61-4c6a-9d42-ad055a0de008',
    '8abf299c-cd4e-4ce0-b5cf-4f9abe8cc891',
    '0b6eb077-2eca-4fe6-b012-26fab725b907',
    '5c659e81-e687-4710-a4fd-000ca593155d',
    '42965913-4223-484a-9b3d-abc0002d277d',
    '72e639d1-b8c5-45fd-9acd-a8e5e2b7fa0d',
    'd6823ccd-7247-4efc-8841-f53f456351ed',
    '1048a860-d5ff-4f61-95e5-851e1266d4c1',
    '8681cdcf-b775-4b56-aace-3f3e448261ef',
    'f0db3b27-c952-477d-bc33-9b96a250e168',
    '9e1a6a7d-b45e-4fd2-a1ff-df131da4c713',
    'd596ee91-481c-4eb5-9a8a-88c1e10ba9b6',
    '24470eb6-97c2-4cd4-b484-87a7d634c5b3',
    '3a781e70-cf6e-41c2-8d68-2326f16986e7',
    '048afc84-cdd2-4b39-8ea5-7351f4699761',
    '3fdcdafd-5328-418b-85f8-47b0006de468',
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    'ec6f49a2-176c-4564-82c5-e751baab46aa',
    '68ea655f-b4a3-43e2-95e4-f158ca2d67dd',
    '0447c960-bbfe-4e42-a95b-dd3d1d9a368e',
    'cc19d19e-6f7e-41b8-87a6-77f41d53e650',
    '16031a34-f1ba-4bde-af43-1822f1516944',
    '483d3454-54da-4243-bbeb-98cbf1d088d0',
    '6765ce2d-ebc8-4367-8855-c0f8e62cb355',
    '1b4a324a-5621-4399-85d2-f91aa03418b5',
    '09642596-d33a-4261-8bf7-eb1dbb37d572',
    '75119ed5-b8aa-4f45-bdef-e3c673bbe44c',
    '04a874df-c57b-40fc-9139-bc3a05129115',
    '2b8ad26a-e66e-4b03-a65a-5b504cecacfd',
    'a7226f10-bdba-4284-97b3-0738a5912770',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    '8da05494-fe7a-4af5-b257-bada143ee426',
    '8b098ab4-df02-4619-8ded-657e496695c1',
    'd48adc59-8934-41bb-9720-63e71f1933be',
    '8e88cabc-e713-44ed-a5d2-41935c3b4eb5',
    'be8cfc23-cd19-46fb-92e1-a77ac380d7aa',
    'e2a398ff-18c3-4258-9d75-89adb2923e88',
    'cb8ebcd0-bb5e-4a6d-bfef-5c651a1a9f6e',
    '6238f8f7-5efb-4023-8d85-ef7db9b4dad7',
    '32c09444-3d4a-44d5-af6b-07eef92189db',
    'd7686f98-05a4-45c9-af2e-3ebc524a5b2d',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '8ccefc59-38a5-476f-b7d3-3f98315a97f0',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '6e67e1e1-5c39-43da-960f-48385789c4e1',
    '92382848-f5e9-426c-b7dc-f2841ae97018',
    '4999a410-990e-484b-b4f3-d636f894a741',
    '1f534eb4-701f-4182-9895-64c5e5b52d82',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    'feca4815-b44b-4b2b-8d77-75edd62ba5a6',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '7427b2eb-a84f-413c-bfb0-7d2e36b0628f',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '28c3df75-0b08-4d5c-9feb-6e2e918572ea',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    'f69c21e9-cb5f-4e72-acfe-c54b672a9f3b',
    '2ef4530a-cc36-4f32-9a1a-63a555346587',
    'e917c83d-c482-442d-81ce-869de7d20903',
    '10774229-1487-4188-b2c3-1fabcf85492a',
    '65793118-3c88-4185-9172-2354850e6056',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    '46536136-08e4-4521-8e6c-67f023de020d',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    'b8e7fe18-9c3d-4cc0-bbc7-85b27197fc8f',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    'b60b4737-c646-4299-85a0-520890e830b7',
    '280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
    'd48db47e-acba-4377-b031-f6dfc21f3658',
    '3fd2204c-8654-4af7-832f-c186447262e0',
    'ae34e63e-13af-48b8-8b72-8137289091b3',
    'd3ed2595-b8be-40c8-b7b6-10a4997b9d2e',
    '575dc7da-58ed-407d-9e88-7b586f28bf65',
    '20ddfcd5-d456-431b-9f05-781e05d873d6',
    '15d41c35-943c-474b-afa6-e1c6d6e4be2b',
    '61803dc8-f649-43e5-ab15-d351f2cef629',
    'abe58d43-e1c7-4953-aa41-4d3b6f6cca44',
    '3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
    'fa278604-7d85-4491-a30d-15c7821f8b00',
    'dd6c6688-b73a-464c-86d9-3369fdf98268',
    'b5c0bf91-9d20-41a2-9dd2-87d0ef0310f9',
    '97c636f9-0983-481f-8ff9-7b5b3ee6b10e',
    '15b153f5-ed02-4216-8f96-99743b8b4fc3',
    '747858c0-d139-4f52-9f0e-a618b880d6d6',
    '32bc49c6-7583-4613-a72f-5edb12b2a808',
    '3eb8ea77-4605-4bb7-90f9-671953abe4a2',
    '2b08cb76-061d-44c6-a00f-b43a5421df5e',
    'ab7e390a-adc5-4f9e-b317-a216a2904c93',
    'd1e6d0e4-d49e-4a16-93c6-7956b2c03414',
    '713f8676-8034-4827-bccc-cd6d95b1a4c4',
    'b00883d8-9251-435d-aefc-8a703d96d2fb',
    'eefbea02-0d65-441e-b455-35aa21d25ba3',
    'ff7e3be2-c0ac-4d97-85da-6229bf7585ac',
    '7ac92a42-e112-49c3-a8f5-8ad2c7ef5578',
    '0701aae2-8661-4eec-84e0-7c8be1c89a18',
    '7efb1905-34b4-4f1c-a8a6-8e64b3640a68',
    '11a2b088-8c1c-47d2-9c1e-455d457d2f05',
    '74608bd9-39e4-4f48-9b7c-1cd9d3c599c9',
    '7baf8e8c-de11-452d-b2e1-aad7c08cc18e',
    '23b0219d-0820-4017-b942-bda8578e90e2',
    'd7bcfc5d-e258-4bd6-a413-bb7a118e6bff',
    'a5f631ea-2b4b-43f2-9ea0-e31f2b11fa27',
    '8523489a-f57c-4993-81e4-1ed86a5c092d',
    '395da421-e6e8-4a26-ac93-eb7050a7cb1f',
    '9cb5ce25-38e8-4628-9ddf-d6aedf5efe0f',
    '615f6246-1c39-4e44-a9d4-c7133a2ae62d',
    'a647528d-925e-4c02-8825-ff54720c6ee4',
    '2c6f63b2-439e-499f-b687-b3fdd88a492e',
    '68a916af-2e0c-41bd-8535-c7eacbc2d1b7',
    '0e7f31a0-c712-4ebf-ab3a-64c37f43e52a',
    'c8b1d323-f352-482e-bf17-82075c23dcee',
    'd30f51c7-d642-4e7d-a168-967b9520a80a',
    '8d89608c-0d61-4d71-a2e3-9fbc6cda69bf',
    '71219f56-551f-4ad4-9a38-cc4aaf8a1e9a',
    '48dd6010-77dc-465b-a27c-695e29b57a5e',
    '3376a8b6-7ef6-4191-97ab-a547da0d330d',
    '21384132-1697-4e9b-b863-a6492d13285d',
    '582f5f8d-b96f-490e-b417-ba824baeb06c',
    '7e825ee6-7c03-43cc-b0a4-0d9203a30bd9',
    '2843292e-e494-4642-90e0-57e5c153f12c',
    '4ecbb7c8-0246-47f8-9654-4caca1d52565',
    '7e3ea1bd-95ba-4cad-90c8-3eec95be9cc8',
    'bbba696b-d023-4bb1-a213-c8bee31e8bae',
    '00bd45f9-beb2-4fb0-8680-bd30e392975a',
    'f85e467a-958f-4da5-a01b-8df883e69122',
    '470eee0f-2053-4d9b-9f5e-ca9661a6cc16',
    '5c1dc76d-b703-445c-9b38-cc2d00b9ab16',
    '0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
    '608d793e-a78b-4872-a50c-21a9eaa60ec3',
    '74ede771-6781-4980-bfb9-5d853b7cdd6f',
    '6c47e282-5d5e-445c-b6bd-c0024946fbe0',
    '6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
    'ff8ffbcf-c932-48c7-8d5e-d995d5680e21',
    '9d74b4f0-b2d4-46aa-867a-52fb6102bfdf',
    'e34f15f7-c225-4314-a638-90504bb0aa0d',
    'c1d222ab-bc0e-4e13-8379-0ee5be9e140e',
    '582187a5-ad63-4759-9162-55fa6337eb07',
    'd1e8d19a-970d-4ede-b5bc-9cab7237adec',
    'af867604-d801-41cc-9949-017eb30a0cbf',
    'c9dd3578-01db-4687-9807-4f71368941d1',
    '722e332c-fb1a-45fe-80c7-cc670f025b7f',
    '9f152896-ebf1-4756-b678-bdf739a92256',
    '478aa270-fbd4-4a45-8f63-221b4066168e',
    'e9c7ad29-2213-4648-9164-33a07bd42cdb',
    '1d140c76-a06b-42a0-bae8-b9e169ebe394',
    '5edcc3db-c676-412a-9506-600959bb81f2',
    '9f7dbe05-96b5-4b2f-9f3a-34b552e3dd21',
    'ccc524ab-d9ad-467c-a25b-9a14fb05e976',
    '9e3fb02d-dcf6-486f-a42d-89446a852057',
    '4b341ba9-49a5-43a2-9b7e-cc96beb59946',
    '15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
    '3a3100bb-369e-47c1-a77c-2cacb7cf020d',
    '2c11b505-17c8-402e-8422-0239accb449d',
    'a6d7e030-e6c8-4c62-8cb5-165ef54987c4',
    'e25a8172-1e34-442c-a45d-583027a2d734',
    '0c18589c-6432-4a6c-90ce-985a47a66f39',
    'c911503c-f010-4c17-ac57-1d82e954bdc7',
    '487016d8-ea02-4b20-a45f-7382139aa865',
    '677f0bdf-6c5c-462b-8294-3666f777bbc5',
    '34da5c11-bbe8-4e55-8d89-9ef8a1c66200',
    '9a4d9d5f-72aa-4d7d-90f1-6d1181ee984c',
    'e6b15b39-daba-431f-a918-e4e43e702c30',
    'bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8',
    '128dce74-fa37-4f2f-8a80-d542edd81a11',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'f461fca1-80b2-4980-83a8-e165d49acc18',
    '37f0f1f9-83fb-49a1-9941-093c068c32d0',
    'cfb3dad7-c6d9-47c0-81b0-2133d75f5c0d',
    'c5c0893f-b254-4038-8d08-b28ef5a26b5d',
    'bbcf8529-1a04-43fc-b6cf-cb161028159d',
    '06421648-dfcb-4460-b93b-c7d6804dddbb',
    'e0b28b59-1cb5-44f4-ab8f-badf5c74f69f',
    '631deea0-2821-4d14-ad02-dc0ce4864924',
    '95788aa7-c897-4ae8-9166-4b8fc1fc5342',
    'eede320a-ed63-41d8-960d-5405a26a194f',
    '36dccf81-6932-43ae-9864-53379832d878',
    '9102024d-58c0-4bb9-aa55-12c00d98b6cd',
    '01eaf423-8cab-491a-b82e-6915dbc73594',
    '0481a135-9db1-424f-9065-a83ebd7ec995',
    'b60876c5-d825-4303-befb-ffff55b92aba',
    '49022563-1be1-4e42-a11c-01743cd5c94d',
    '64f2dbe1-6f58-493d-ab6b-c93568d828f4',
    'a9626803-72c2-4e23-968c-a090e3f22c5e',
    '095728d6-4ea1-4909-8a74-a8f3fa7f86cb',
    '8309cd89-a912-462a-90ad-f13ae0d7aa6c',
    '902596ce-714e-49b3-8271-f3dfece52309',
    'f5f29e4b-68f7-443c-b290-0827d4167fd5',
    '07c3a7f4-1e59-4dcb-a244-2fd3d084e2b0',
    '6f49717f-8f57-42d0-8548-316ecc292415',
    '7e693091-8ae4-4c40-8e66-c3b39f01b90e',
    '544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
    '5137255f-0c58-4ac7-9266-bda8ab0247c2',
    '5243df74-712d-49a8-989b-528d15088e8f',
    '278a26cb-a710-4fff-928e-fc2e7084a75a',
    '58a1d168-8290-4c69-bf01-17ba3a084365',
    '9fc492f3-8d13-47ae-93e9-812c0224f1aa',
    '822d381e-cea0-45bb-8fa0-1b7194b4b64b',
    '2dc01a50-ea7b-4d9b-be57-1ffbdd98b27b',
    '5b6676dc-f46e-43a8-b87c-e431e369e53c',
    'eb35085f-0cbf-4829-a3ad-acaa53a250b5',
    '36fa2d20-622d-4cca-80b0-683672c94170',
    '44161b51-953d-4f6b-9448-5cba4a44a9d8',
    '0b90b2ea-8ca3-406a-9f69-95eddf7699ef',
    'bfb202bc-4078-4df0-82b9-9218dbc1f1a1',
    '8cc59f51-b0df-4a5d-a3c5-83ee526ff1af',
    'fda7c4b9-9f35-482b-9eff-be7f11058d94',
    '85646f4a-e424-4363-8033-1e7522e8f175',
    '8945794e-174a-49f9-a2d4-4242f9bf3833',
    '529343b4-698a-4b36-ac55-db8a6965ad3f',
    '0f949ee9-0986-42b2-af5d-0f4c8338c664',
    'cb1f06fa-b916-477d-8ab6-fb4b3f24efd3',
    '05253b3c-e8a3-4db4-8a6d-014eac7b3d94',
    '4807db90-b0f7-441d-b489-932f9b341f74',
    'c33b1f32-6021-4d1c-a4d5-fc3d501107f4',
    'aa314675-af62-41df-b5cb-3b22558e903b',
    '20741062-7d1d-44b7-bc33-39c9ad26e414',
    '69ce1be3-1815-43a4-bdd2-4696d9c8d09a',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '18716daf-4223-44a9-bba9-fc9baeef7d07',
    '475430c5-28cb-456d-9c5c-bdbfab9fafb2',
    'f0061cb3-688e-4ad4-aeb8-8614282292ec',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '3a89c170-2939-4c12-9940-f32d96fa9e55',
    '9a06c401-da3f-41b4-b38b-238796fcae09',
    'b32d88c8-31e3-4789-a75f-e52bf1272937',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    '5e0e8f9a-ce97-4b18-9540-3015c61e393c',
    '1c8ba244-1c7f-433a-825b-d2d34d018dcf',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '373ff2e8-0f63-4179-a55c-3fe0b85556aa',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '7577f264-8e84-440d-9346-7c4d5affda51',
    'febd8561-4769-4f3b-b7c0-ae7ff6ede2e9',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    '2355554e-8951-4b41-bcd8-32e18cddb7c9',
    'de1e7762-673f-4d44-8f45-7e693bb338b7',
    '239a484f-67c2-4ba3-a3d0-d6e4c2b27475',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    'c6f3bd64-ea67-488f-904f-f0bdf6320b5c',
    '166746e8-ce26-4fa1-a587-443ca9fc59a1',
    '80baf71d-28d0-4bca-81b7-49ddfadfa7a3',
    '263ab7c9-bd69-45dd-abb7-bbf35b9786ed',
    'e0c7877e-75d7-47d8-b5e9-5dd677d03353',
    '49a97523-0a7a-4d5a-ae20-496f86de2032',
    '583023a1-aa12-40e2-a964-8ad50ad400ba',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '17d3ffb4-e891-4ac6-a91a-fd52971c1115',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '488a38ee-f996-482d-a562-a4474f5594de',
    '680d748c-7c60-46e2-aea5-7fc557a916ea',
    '462d992a-7c13-45ac-a6da-1254fc3a9031',
    'dbb4df81-9115-45d1-b51d-875e0669edc4',
    '6fd0f009-3c34-4529-9a38-c59745545490',
    '74d1e549-5ae8-4410-9428-f8f2cc85fa80',
    '868f72af-99e8-406e-9f7e-14577e6c7157',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    '595b6755-e7ae-4e83-af2e-693c089aeec3',
    '4d01e12e-503e-4447-8e49-8c2b77ffb00d',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    '279e5670-8a47-4992-bb10-14e6c719db97',
    'd306000b-88c1-4220-8d7e-933c0118a983',
    '703c4bc2-81bf-435a-87fa-21dc9278bad6',
    'ab76b5ca-e464-4063-b949-853f61036370',
    '672b617f-936e-440a-a735-80f94798aed1',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
    'c9986260-0c1b-4fd3-8132-6fa7353046e6',
    'bda2bec8-a142-47ab-bfb4-83759ac2bddd',
    'e858d4f9-3385-4640-b0cb-4894e86d501c',
    '39fe0c8b-bd78-4565-9415-63eabc1d6d85',
    'da4e904f-0346-4cd3-a5c2-ba932511d98d',
    'c46c2220-da88-4f60-a0cf-eebfd0a8ff12',
    '629e31cb-dd7b-4345-abf2-fa23c6c65a09',
    'e9a57082-5a93-481a-bbd0-1acb03ac751a',
    '0faf149d-b316-4fbd-8605-a59354f0eacd',
    '1d23d3cc-5db6-4734-bfaa-507dd366d99b',
    'dd58f556-0049-49c3-9a51-d6470a2abddc',
    '53185d06-f2cc-4942-88c4-8534b559a9ff',
    '0132f320-830d-40d0-a4da-06a5d5f9e8d9',
    'bb7d6408-941a-4da6-8613-36498bc6d91b',
    'b8d11ca1-3db8-4efa-bf57-0305e004a26d',
    '9bd56ad6-080e-4d26-acca-83e4df8aa913',
    '00c11c7e-8530-4bfc-abd7-8c10f4c602d3',
    '8f6b9e20-9468-4f46-aa45-eeab9de88e53',
    '272dff18-acf3-4874-a55f-ba8fb6f80352',
    '70ac3659-06bb-4022-be55-af81d3e35b6f',
    '409b92cf-5c4d-4997-9736-ef2ea10d19e9',
    'a8636719-e26c-49b6-9a53-7d77f3d3c94b',
    'a3e81d5f-8dd6-43dd-9172-d80d212efa2d',
    'd40af129-c13f-45b2-92f0-d0e8fa5cc1c9',
    'ecd2d2f9-2b6f-4743-8d04-c9bb554a96cb',
    '9ee78822-7acd-4fab-9999-c58e9fe266ad',
]

# Loop through datasets and pull results
df_results = pd.DataFrame(columns = ["dataset_id", "table", "column", "value", "row_count", "status"])
for dataset_id in dataset_id_list:
    
    # Establish API client and pull dataset details
    print(f"Processing dataset_id {dataset_id}...")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        print("Failure - Issue Retrieving Dataset Info")
        continue
    
    # Loop through non-array fields and pull data
    for field in nonarray_field_list:
        table_name = field.split(".")[0]
        field_name = field.split(".")[1]
        client = bigquery.Client()
        query = f"""SELECT '{dataset_id}' AS dataset_id, '{table_name}' AS table, '{field_name}' AS column, {field_name} AS value, COUNT(*) AS row_count, 'Success' AS status FROM `{bq_project}.{bq_schema}.{table_name}` GROUP BY {field_name}"""
        try:
            df_output = client.query(query).result().to_dataframe()
            if df_output.empty:
                df_output = pd.DataFrame(data={"dataset_id": [dataset_id], "table": [table_name], "column": [field_name], "value": [None], "row_count": [0], "status": ["Success"]}) 
        except Exception as e:
            print(f"Failure - Issue Pulling Data for {field}")
            df_output = pd.DataFrame(data={"dataset_id": [dataset_id], "table": [table_name], "column": [field_name], "value": [None], "row_count": [0], "status": ["Failure"]})
        df_results = pd.concat([df_results, df_output])
    
    # Loop through array fields and pull data 
    for field in array_field_list:
        table_name = field.split(".")[0]
        field_name = field.split(".")[1]
        client = bigquery.Client()
        query = f"""SELECT '{dataset_id}' AS dataset_id, '{table_name}' AS table, '{field_name}' AS column, {field_name}_unnested AS value, COUNT(*) AS row_count, 'Success' AS status FROM `{bq_project}.{bq_schema}.{table_name}` CROSS JOIN UNNEST({field_name}) AS {field_name}_unnested GROUP BY {field_name}_unnested"""
        try:
            df_output = client.query(query).result().to_dataframe()
            if df_output.empty:
                df_output = pd.DataFrame(data={"dataset_id": [dataset_id], "table": [table_name], "column": [field_name], "value": [None], "row_count": [0], "status": ["Success"]}) 
        except Exception as e:
            print(f"Failure - Issue Pulling Data for {field}")
            df_output = pd.DataFrame(data={"dataset_id": [dataset_id], "table": [table_name], "column": [field_name], "value": [None], "row_count": [0], "status": ["Failure"]})
        df_results = pd.concat([df_results, df_output])

# Aggregate and display final results
print("\nDataset Level Results:")
df_sorted = df_results.sort_values(["dataset_id", "table", "column", "row_count"], ascending=[True, True, True, False], ignore_index=True)
df_sorted_path = "search_facet_values.tsv"
df_sorted.to_csv(df_sorted_path, index=False, sep="\t")
!gsutil cp $df_sorted_path $ws_bucket/ingest_pipeline/resources/search/ 2> stdout
!rm $output_file_path

display(df_sorted)
print("\nAggregated Results:")
df_results_agg = df_results.groupby(["table", "column", "value"])["row_count"].sum().reset_index().sort_values(["table", "column", "row_count"], ascending=[True, True, False], ignore_index=True)
df_results_agg_path = "search_facet_values_agg.tsv"
df_results_agg.to_csv(df_results_agg_path, index=False, sep="\t")
!gsutil cp $df_results_agg_path $ws_bucket/ingest_pipeline/resources/search/ 2> stdout
display(df_results_agg)
        

Processing dataset_id cefc1a79-446c-40d2-b140-ba8d8b1c0712...
Processing dataset_id 4e699ead-bbb5-460d-9b32-2b1b322c601b...
Processing dataset_id cefc1a79-446c-40d2-b140-ba8d8b1c0712...
Processing dataset_id 2355554e-8951-4b41-bcd8-32e18cddb7c9...
Processing dataset_id bafbf771-1cd2-44fc-9b38-5a4bbead8ab2...
Processing dataset_id a36eeaf7-d6dd-4887-bdbd-e435a07ba156...
Processing dataset_id 8de6dae2-55ff-4287-9b75-5b2a950c1f44...
Processing dataset_id ce6692aa-0f97-48fa-8628-b8fa3eab4726...
Processing dataset_id 31433635-91d4-431d-8d26-bc54e84c8e8c...
Processing dataset_id 0b06619d-39d9-4437-8c42-2e415faa634c...
Processing dataset_id 12ffb586-5f6a-4f0a-a353-d2f34599f4cc...
Processing dataset_id e642bca0-52fb-4ab3-ab3a-acaab83deda7...
Processing dataset_id b7fb531e-25a4-427c-9679-b7bdc3d03535...
Processing dataset_id 3615e063-f24b-47f7-87cb-430e8aca8d0c...
Processing dataset_id a3ea4f97-6657-4d3c-9be6-96f097f5c952...
Processing dataset_id 9f4ac69c-0919-4ac1-98a8-976ed79ace03...
Processi

Processing dataset_id bd492b71-b20e-4056-b8ae-ad8c94cfbc02...
Processing dataset_id 9ecc231f-e3d3-4417-a98a-c4db4c638161...
Processing dataset_id 3fb2d04a-d18b-4bdc-9372-99b992f2ae42...
Processing dataset_id e922a496-e686-4fa1-911d-2159ceb0f09f...
Processing dataset_id 8fbfea50-6a71-4b19-98e9-f95e3a8594c7...
Processing dataset_id d911e57a-ebb8-4be8-876b-d8e5790ddce3...
Processing dataset_id 6c9423a2-3ea7-4c3c-9b12-0cc993bc095f...
Processing dataset_id 52e015b5-22b7-4a96-9f0a-ea3afccbfcbc...
Processing dataset_id 325f3ee8-2adb-4092-bd78-1b5ea5b0d1d6...
Processing dataset_id ae50ef98-ef3d-4427-b094-83b2d90787a0...
Processing dataset_id e6771964-50e9-482f-9d23-18c22cd89ab8...
Processing dataset_id 3fbacc64-4c53-4770-8cdf-a616c10ec5c7...
Processing dataset_id a3ae33bb-8b3a-47e5-a2d1-a49c954776b3...
Processing dataset_id 0b0a52bb-a1a2-4638-9259-4447761c2da4...
Processing dataset_id 0eb42259-7b44-450f-a9d7-500b2ea7179c...
Processing dataset_id e16adabb-88e0-4739-983a-98ac5c181842...
Processi

Processing dataset_id 8e88cabc-e713-44ed-a5d2-41935c3b4eb5...
Processing dataset_id be8cfc23-cd19-46fb-92e1-a77ac380d7aa...
Processing dataset_id e2a398ff-18c3-4258-9d75-89adb2923e88...
Processing dataset_id cb8ebcd0-bb5e-4a6d-bfef-5c651a1a9f6e...
Processing dataset_id 6238f8f7-5efb-4023-8d85-ef7db9b4dad7...
Processing dataset_id 32c09444-3d4a-44d5-af6b-07eef92189db...
Processing dataset_id d7686f98-05a4-45c9-af2e-3ebc524a5b2d...
Processing dataset_id 1939b7ae-fc6b-42a8-ad5f-dc51a1682a17...
Processing dataset_id 8ccefc59-38a5-476f-b7d3-3f98315a97f0...
Processing dataset_id 2cda53ba-b852-47e8-8f24-59ab8e9f1d1f...
Processing dataset_id 6e67e1e1-5c39-43da-960f-48385789c4e1...
Processing dataset_id 92382848-f5e9-426c-b7dc-f2841ae97018...
Processing dataset_id 4999a410-990e-484b-b4f3-d636f894a741...
Processing dataset_id 1f534eb4-701f-4182-9895-64c5e5b52d82...
Processing dataset_id d01a4268-1bfe-4a2d-a2d4-e296162c406e...
Processing dataset_id feca4815-b44b-4b2b-8d77-75edd62ba5a6...
Processi

Processing dataset_id 841970b7-bed0-4a75-a28a-a4cc59740a84...
Processing dataset_id a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2...
Processing dataset_id f461fca1-80b2-4980-83a8-e165d49acc18...
Processing dataset_id 37f0f1f9-83fb-49a1-9941-093c068c32d0...
Processing dataset_id cfb3dad7-c6d9-47c0-81b0-2133d75f5c0d...
Processing dataset_id c5c0893f-b254-4038-8d08-b28ef5a26b5d...
Processing dataset_id bbcf8529-1a04-43fc-b6cf-cb161028159d...
Processing dataset_id 06421648-dfcb-4460-b93b-c7d6804dddbb...
Processing dataset_id e0b28b59-1cb5-44f4-ab8f-badf5c74f69f...
Processing dataset_id 631deea0-2821-4d14-ad02-dc0ce4864924...
Processing dataset_id 95788aa7-c897-4ae8-9166-4b8fc1fc5342...
Processing dataset_id eede320a-ed63-41d8-960d-5405a26a194f...
Processing dataset_id 36dccf81-6932-43ae-9864-53379832d878...
Processing dataset_id 9102024d-58c0-4bb9-aa55-12c00d98b6cd...
Processing dataset_id 01eaf423-8cab-491a-b82e-6915dbc73594...
Processing dataset_id 0481a135-9db1-424f-9065-a83ebd7ec995...
Processi

Unnamed: 0,dataset_id,table,column,value,row_count,status
0,00bd45f9-beb2-4fb0-8680-bd30e392975a,anvil_biosample,anatomical_site,,910,Success
1,00bd45f9-beb2-4fb0-8680-bd30e392975a,anvil_biosample,biosample_type,,910,Success
2,00bd45f9-beb2-4fb0-8680-bd30e392975a,anvil_dataset,consent_group,,0,Success
3,00bd45f9-beb2-4fb0-8680-bd30e392975a,anvil_dataset,data_modality,,0,Success
4,00bd45f9-beb2-4fb0-8680-bd30e392975a,anvil_dataset,registered_identifier,phs001489,1,Success
5,00bd45f9-beb2-4fb0-8680-bd30e392975a,anvil_diagnosis,disease,,0,Success
6,00bd45f9-beb2-4fb0-8680-bd30e392975a,anvil_donor,organism_type,Homo sapiens,910,Success
7,00bd45f9-beb2-4fb0-8680-bd30e392975a,anvil_donor,phenotypic_sex,,910,Success
8,00bd45f9-beb2-4fb0-8680-bd30e392975a,anvil_donor,reported_ethnicity,,0,Success
9,00bd45f9-beb2-4fb0-8680-bd30e392975a,anvil_file,file_format,.idat,1820,Success



Aggregated Results:


Unnamed: 0,table,column,value,row_count
0,anvil_biosample,anatomical_site,Unknown,123443
1,anvil_biosample,anatomical_site,Brain,4348
2,anvil_biosample,anatomical_site,Esophagus,2466
3,anvil_biosample,anatomical_site,Skin,2186
4,anvil_biosample,anatomical_site,Heart,1461
5,anvil_biosample,anatomical_site,Colon,1286
6,anvil_biosample,anatomical_site,Buccal Mucosa,1251
7,anvil_biosample,anatomical_site,Thyroid,1099
8,anvil_biosample,anatomical_site,Lung,1007
9,anvil_biosample,anatomical_site,Breast,749


# Pull Study/Consent Info Across Datasets

In [None]:
#############################################
## Functions
#############################################

def check_phs_and_consent(dataset_id_list):
    """Show the study identifier and consent fields recorded for each dataset.

    For every dataset ID, the dataset's BigQuery project and dataset name are
    looked up through the datasets API, then ``title``,
    ``registered_identifier``, ``consent_group`` and ``data_use_permission``
    are read from the first row of its ``anvil_dataset`` table. One result row
    is recorded per dataset, whether the lookup succeeded or failed, and the
    combined table is shown with ``display``.

    Args:
        dataset_id_list: Iterable of dataset ID strings to examine.

    Returns:
        None. The results table is displayed rather than returned.
    """
    result_columns = ["Dataset ID", "Title", "PHS ID", "Consent Group", "Data Use Permission", "Status", "Message"]

    # Loop through and process dataset IDs
    results = []
    for dataset_id in dataset_id_list:

        # Retrieve dataset information
        logging.info(f"Processing dataset_id = {dataset_id}...")
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        try:
            response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
            bq_project = response["access_information"]["big_query"]["project_id"]
            bq_dataset = response["access_information"]["big_query"]["dataset_name"]
        except Exception as e:
            error_message = f"Error retrieving dataset details: {str(e)}"
            logging.error(error_message)
            # Every row must carry one value per result column; a shorter row would put
            # the status and message under the wrong headers (or fail frame construction).
            results.append([dataset_id, None, None, None, None, "Failure", error_message])
            continue

        # Review the recorded PHS ID and consent code
        client = bigquery.Client()
        query = """SELECT title, registered_identifier, consent_group, data_use_permission FROM `{project}.{dataset}.anvil_dataset`""".format(project=bq_project, dataset=bq_dataset)
        try:
            df = client.query(query).result().to_dataframe()
            # "".join concatenates the elements of a repeated value with no separator and
            # returns a plain string unchanged.
            # NOTE(review): presumably these three columns are string arrays — confirm against the table schema.
            results.append([dataset_id, df["title"].values[0], "".join(df["registered_identifier"].values[0]), "".join(df["consent_group"].values[0]), "".join(df["data_use_permission"].values[0]), "Success", ""])
        except Exception as e:
            # This also catches an empty table (no first row) and null field values,
            # not only errors raised by the query itself.
            error_message = f"BigQuery error: {str(e)}"
            logging.error(error_message)
            results.append([dataset_id, None, None, None, None, "Failure", error_message])

    # Display results
    logging.info("\nResults:")
    df_results = pd.DataFrame(results, columns=result_columns)
    display(df_results)


#############################################
## Input Parameters
#############################################

# List of dataset IDs to examine
dataset_id_list = [
    'e2a398ff-18c3-4258-9d75-89adb2923e88',
    'd7bcfc5d-e258-4bd6-a413-bb7a118e6bff',
    '8da05494-fe7a-4af5-b257-bada143ee426',
    '8b098ab4-df02-4619-8ded-657e496695c1',
    '8e88cabc-e713-44ed-a5d2-41935c3b4eb5',
    'be8cfc23-cd19-46fb-92e1-a77ac380d7aa',
    '6c9423a2-3ea7-4c3c-9b12-0cc993bc095f',
    'ae50ef98-ef3d-4427-b094-83b2d90787a0',
    '3fbacc64-4c53-4770-8cdf-a616c10ec5c7',
    '0b0a52bb-a1a2-4638-9259-4447761c2da4',
    '0eb42259-7b44-450f-a9d7-500b2ea7179c',
    'e16adabb-88e0-4739-983a-98ac5c181842',
    '0e65b131-fd14-4fce-908b-c5b89a71a9c1',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '7577f264-8e84-440d-9346-7c4d5affda51',
    'febd8561-4769-4f3b-b7c0-ae7ff6ede2e9',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    'de1e7762-673f-4d44-8f45-7e693bb338b7',
    '74d1e549-5ae8-4410-9428-f8f2cc85fa80',
    'db266afc-2f75-4b03-a3b8-c69e0ce6f713',
    '12bbfa4c-c30a-4cf6-b79f-45354f842964',
    '84ac0d05-4be5-43e9-973e-ef999144d802',
    'ff8b1212-858a-4048-8f63-9464c922591a',
    'c814d754-cdc5-4b0d-8671-a39e85b2c473',
    '797b2563-5d56-4f5c-bdaf-3bfd11e8f5b3',
    '7eeede5a-c86f-4577-9f3c-65ab618a6dee',
    'a52c04ee-cfef-46bb-9b40-6a9b292e1a7b',
    '0194eea9-d779-4957-8521-11717a378e66',
    '0d82658c-44b3-4cea-a388-3353a96a31ef',
    '77dca0d5-4d22-4415-8858-075590d25cb5',
    '1b05159b-6277-4345-9d59-f7bba5ea1d56',
    '92299ff4-c0d0-4e94-b374-75d0038cbd68',
    '60f96582-79ad-4461-9f9a-53c1bc3d17b6',
    'b5d7c34a-c383-4fc7-aa4d-b6dc941cd41a',
    '85baa8f8-619c-4165-9d3e-53220f645814',
    'c423b18d-12f2-43e4-97f9-993e2943270e',
    'ceee2791-0fdf-45fc-a4e8-8077916771aa',
    '2a263db0-8c33-4171-840f-54bf4755a4b9',
    '9828f3fe-f676-4bf1-b600-5effa24ea9c8',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
    'bda2bec8-a142-47ab-bfb4-83759ac2bddd',
    '629e31cb-dd7b-4345-abf2-fa23c6c65a09',
    '956cd931-7077-4a08-9c75-ab8b4e5d1eb8',
    '31a42df0-29f6-4d4e-ae5c-2e13abc355f2',
    '13364604-ed08-4a61-89cd-65eb372ac8c3',
    '53185d06-f2cc-4942-88c4-8534b559a9ff',
    '22199347-9454-41e5-8912-eb38edd33a25',
    '0132f320-830d-40d0-a4da-06a5d5f9e8d9',
    '60cadee3-9e63-4897-ac81-4fb283033648',
    'bb7d6408-941a-4da6-8613-36498bc6d91b',
    'dcd4112f-09d3-43ed-8441-df9bf4c9ddc8',
    'b8d11ca1-3db8-4efa-bf57-0305e004a26d',
    '9bd56ad6-080e-4d26-acca-83e4df8aa913',
    'da02c3f1-371b-4afe-9b5e-b8c584fd5907',
    '8f6b9e20-9468-4f46-aa45-eeab9de88e53',
    '72f73fc5-6a3a-43a0-8cce-09f4726b736c',
    '72e639d1-b8c5-45fd-9acd-a8e5e2b7fa0d',
    '8681cdcf-b775-4b56-aace-3f3e448261ef',
    'f0db3b27-c952-477d-bc33-9b96a250e168',
    'b12fb9be-2ce0-4bfd-8503-732fabba06ab',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '3a89c170-2939-4c12-9940-f32d96fa9e55',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '373ff2e8-0f63-4179-a55c-3fe0b85556aa',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    '3fdcdafd-5328-418b-85f8-47b0006de468',
    'c6f3bd64-ea67-488f-904f-f0bdf6320b5c',
    '166746e8-ce26-4fa1-a587-443ca9fc59a1',
    '595b6755-e7ae-4e83-af2e-693c089aeec3',
    'a3ea4f97-6657-4d3c-9be6-96f097f5c952',
    '9f4ac69c-0919-4ac1-98a8-976ed79ace03',
    '96461004-f4b3-4f82-a842-293b3ec46a60',
    '841b7883-9447-4ea0-ae4a-84ea0240d919',
    'd0fc3d6a-c3f4-4533-8a23-817a4e27f9be',
    '2cace5dc-f660-45d4-b689-c4c89e77697c',
    'd2272f2d-c606-4027-b8ea-0bdd6d9d6535',
    '179eb85e-2557-4677-9cba-d763310f3df9',
    '13b2076a-cfe1-49ec-ac61-bad1af9a52ea',
    '175dd803-02c7-4823-81d5-9e0621652ace',
    '15492baa-05ed-47bc-b50c-e587679ae51a',
    '33705ce9-b2b3-4edc-9b47-f54283e193cf',
    '92486440-3a46-44dd-b853-b300ef75b31e',
    '3725b660-1106-4173-9c4b-0a15926becf5',
    '8b8185d3-ba5c-4832-af23-3ff8ca6ed016',
    'ec97fa0f-e174-40fe-a6b8-ee240bdf4318',
    '5488d7c1-5195-4ebc-b0f0-31033fa06dc9',
    '56f9888f-e623-4a1a-b2b4-46378a6cd6fe',
    'b2e7f15b-65d5-4812-abfd-b2dbc6d18850',
    '69f8d7c2-2e14-48e6-b838-7881016313fb',
    '809fa952-3178-46b7-bb82-8a476ef32e67',
    'dfb14a1f-38b2-4668-b98f-59b5b5b53ca3',
    'f553b765-1c9c-464c-a8fa-07700a1691c5',
    'b108dfd0-711d-4bc1-aab5-1b312226c8ad',
    '38eca26c-d79e-4447-99d3-1889d20ade21',
    'd5a0e24d-689a-4854-92c7-9a39f980b523',
    'ecd0606f-4fa7-4e57-b6e8-eea377e65d5b',
    '0c6bc810-5ae0-4926-a56b-2bc2fe7dbe6d',
    '9dc31133-c882-4f39-903a-a25f316bb560',
    '23a0ede0-4f97-46af-9f04-bd2805050980',
    '7593c1c2-3680-4bf5-8a65-dce5f96a3b59',
    '02661394-2886-4ef7-aff1-d53225c82025',
    'c1644d4e-06e2-4fa8-95f1-5c1da5831257',
    '267cf516-dd33-4640-a71a-78bd8f5db9d8',
    'df06ff22-6a2d-4934-aac9-c8368efbea1a',
    'dcdefb14-f6de-4c46-ac7e-842b273416bf',
    '28208cc6-50bf-4864-9a48-981632066640',
    'ccfe264d-a35e-44f8-9b2b-241a0f8327cc',
    'f177843d-47fb-46ae-83be-73c92ee85081',
    'd049d487-1a69-4358-8dad-0e6fa6c06fdd',
    'da29226b-e856-4014-8c8b-c4268d0df2cc',
    '714dcdbd-8d17-40b0-8246-0e941af8175d',
    '53ce7d12-facc-4412-a710-f535efb209a2',
    'b9842819-5fd6-40c5-9668-aae1ea44a308',
    '7c056125-3ed8-459c-b73e-edfa3f80cc27',
    'ed1215f1-787c-40f5-9d77-4b5bc2dfbb84',
    '7feeb2b1-1926-4968-b6dc-e0a1e4cf8d4a',
    'dbf5d87b-4cab-44cd-a792-1d0218aad973',
    '85dbde76-c130-40b2-8a8a-ba815ba499da',
    '1c2fe11d-b020-4c54-8c71-1ea91623d626',
    '84133066-68cd-41fa-819b-d74a3ac85862',
    '54c6fa73-9b84-4a3b-9e97-e4e43165c48b',
    '90ba1853-f845-4502-ba36-b75b9e571bc5',
    'f3c88c3c-8e1b-4af9-9467-0621404e314c',
    '332bb145-6ef1-40ef-932c-aec5bb6210d9',
    '44f83f20-d618-40b5-b2cb-3676b8fe3ad7',
    '3c2c39a9-4cc2-4f7c-89e0-054a871e2c4e',
    '2ebb722f-a3df-4ea4-b72a-813e3db0bab5',
    '1817528a-4f88-4ed6-8965-9eae0220ab27',
    'e03eb011-05f9-4491-b779-0cc2aefabff1',
    'd4bb7169-5a7d-4090-ba62-12ea799c3ade',
    '3f172982-060d-4339-a09b-6994c2c9eb16',
    '51789659-5233-4ee7-8bca-dedebfc87773',
    'a5fe75bb-d28c-42fb-aaf8-92fa37b266d2',
    '6545d602-e5b4-4dd1-8f6a-64e0a1952ddc',
    'e5c79b74-20d5-4b6f-8085-0bc788eed2ea',
    'f492567d-6db8-45c8-b44e-6b5def26c812',
    '12e54f96-00d9-4a38-921e-e0d42610b2c1',
    '845b131e-7c05-4397-ad40-23dea8e9b399',
    'bd492b71-b20e-4056-b8ae-ad8c94cfbc02',
    '9ecc231f-e3d3-4417-a98a-c4db4c638161',
    '3fb2d04a-d18b-4bdc-9372-99b992f2ae42',
    'e922a496-e686-4fa1-911d-2159ceb0f09f',
    '52e015b5-22b7-4a96-9f0a-ea3afccbfcbc',
    'a3ae33bb-8b3a-47e5-a2d1-a49c954776b3',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '655e6a61-5400-4d8a-95bc-1506e026b289',
    '02ff1051-cd1d-4bbb-a005-21384cbff846',
    '35a1009d-93a2-49b1-a801-fe84d6b7a2f5',
    '50132478-c9fb-4dc5-86cd-d5dfab909393',
    'f1513955-0264-4733-bd25-3f752c61a323',
    '93e712f2-3e54-466e-aa53-57eb69c43bc0',
    '2cbe079d-e7ab-47d8-836e-454a71440297',
    '84fad495-2756-472f-ad20-f91de6f67baf',
    '28e73469-12d4-493b-bf6f-83359c1f69c5',
    'dd6866e4-8949-45bd-8910-8ce64f79e3c7',
    '85287d84-fefe-40df-ad40-5b135ee0c07f',
    'cc107de7-d623-464a-a875-c8b7ae5fb09d',
    '416b8daa-9537-46db-ae7b-3f5ff5f01dc3',
    '61940344-e6c1-484e-ba10-131f43a9b13a',
    '5205f817-7de0-48b4-89fc-6398cf13bff7',
    'f757278a-3c74-4690-bf89-5149d21ff3af',
    '5a103ab3-29c3-4d07-a0f6-4999c256cf26',
    '28849dc9-a97f-469b-b2ac-a8ff97693f02',
    '29cd0578-fb47-495a-8f48-b37325eed81a',
    'ba503d2e-48af-48bb-910a-be41790d921c',
    '472f01ad-7bc3-4fe5-9771-2695930dbc95',
    'b724164c-712c-4615-97b7-529a108a753a',
    'f6565f2f-4478-45ad-8c11-04dd242fc6a9',
    '275ea204-4612-4d3c-ac0d-f110f61d62ad',
    'be72f1e3-b5f5-43f4-80db-6d7de93a654e',
    'b6bf4699-6f61-4c6a-9d42-ad055a0de008',
    '8abf299c-cd4e-4ce0-b5cf-4f9abe8cc891',
    '0b6eb077-2eca-4fe6-b012-26fab725b907',
    '5c659e81-e687-4710-a4fd-000ca593155d',
    '42965913-4223-484a-9b3d-abc0002d277d',
    'd6823ccd-7247-4efc-8841-f53f456351ed',
    '1048a860-d5ff-4f61-95e5-851e1266d4c1',
    '9e1a6a7d-b45e-4fd2-a1ff-df131da4c713',
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    'ec6f49a2-176c-4564-82c5-e751baab46aa',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'd48adc59-8934-41bb-9720-63e71f1933be',
    '6238f8f7-5efb-4023-8d85-ef7db9b4dad7',
    '32c09444-3d4a-44d5-af6b-07eef92189db',
    '280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
    '20ddfcd5-d456-431b-9f05-781e05d873d6',
    '15d41c35-943c-474b-afa6-e1c6d6e4be2b',
    '3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
    'dd6c6688-b73a-464c-86d9-3369fdf98268',
    '15b153f5-ed02-4216-8f96-99743b8b4fc3',
    '747858c0-d139-4f52-9f0e-a618b880d6d6',
    '3376a8b6-7ef6-4191-97ab-a547da0d330d',
    'f85e467a-958f-4da5-a01b-8df883e69122',
    '0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
    '6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
    'c9dd3578-01db-4687-9807-4f71368941d1',
    '5edcc3db-c676-412a-9506-600959bb81f2',
    'ccc524ab-d9ad-467c-a25b-9a14fb05e976',
    '4b341ba9-49a5-43a2-9b7e-cc96beb59946',
    '0c18589c-6432-4a6c-90ce-985a47a66f39',
    'e6b15b39-daba-431f-a918-e4e43e702c30',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'e0b28b59-1cb5-44f4-ab8f-badf5c74f69f',
    'aa314675-af62-41df-b5cb-3b22558e903b',
    '20741062-7d1d-44b7-bc33-39c9ad26e414',
    '69ce1be3-1815-43a4-bdd2-4696d9c8d09a',
    '18716daf-4223-44a9-bba9-fc9baeef7d07',
    '475430c5-28cb-456d-9c5c-bdbfab9fafb2',
    'f0061cb3-688e-4ad4-aeb8-8614282292ec',
    '9a06c401-da3f-41b4-b38b-238796fcae09',
    'b32d88c8-31e3-4789-a75f-e52bf1272937',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    '5e0e8f9a-ce97-4b18-9540-3015c61e393c',
    '1c8ba244-1c7f-433a-825b-d2d34d018dcf',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '17d3ffb4-e891-4ac6-a91a-fd52971c1115',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '680d748c-7c60-46e2-aea5-7fc557a916ea',
    '462d992a-7c13-45ac-a6da-1254fc3a9031',
    '4bc5b4eb-da91-48f7-bca0-134ed1a484a0',
    '868f72af-99e8-406e-9f7e-14577e6c7157',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    'ab76b5ca-e464-4063-b949-853f61036370',
    'e858d4f9-3385-4640-b0cb-4894e86d501c',
    '39fe0c8b-bd78-4565-9415-63eabc1d6d85',
    'da4e904f-0346-4cd3-a5c2-ba932511d98d',
    'c46c2220-da88-4f60-a0cf-eebfd0a8ff12',
    'e9a57082-5a93-481a-bbd0-1acb03ac751a',
    '0faf149d-b316-4fbd-8605-a59354f0eacd',
    '1d23d3cc-5db6-4734-bfaa-507dd366d99b',
    'dd58f556-0049-49c3-9a51-d6470a2abddc',
    '00c11c7e-8530-4bfc-abd7-8c10f4c602d3',
    '272dff18-acf3-4874-a55f-ba8fb6f80352',
    '70ac3659-06bb-4022-be55-af81d3e35b6f',
    '409b92cf-5c4d-4997-9736-ef2ea10d19e9',
    'a8636719-e26c-49b6-9a53-7d77f3d3c94b',
    'a3e81d5f-8dd6-43dd-9172-d80d212efa2d',
    'd40af129-c13f-45b2-92f0-d0e8fa5cc1c9',
    'ecd2d2f9-2b6f-4743-8d04-c9bb554a96cb',
    '9ee78822-7acd-4fab-9999-c58e9fe266ad',
]

#############################################
## Execution
#############################################

# Run the check for every ID in dataset_id_list; the function displays its results table itself and returns nothing
check_phs_and_consent(dataset_id_list)


# Pull Metadata for CCDG

## Pulling Sample Metadata for CCDG

In [None]:
# Define parameters
# NOTE(review): billing_profile is assigned here but not referenced anywhere else in this cell — confirm whether it is still needed
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Datasets
dataset_id_list = [
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
    '9ee2a552-89f8-4a48-9c94-9fa26ebb7483',
    '425412ba-894a-4824-acb8-bf18fe4576e0',
    'f22bd762-5c45-453e-bf22-b174514abb84',
    '0ee62643-b064-42f8-9b09-5d10eacd70a3',
    '1a7f6728-5116-4f24-897a-59a7f322cfd2',
    'c37b388c-7107-43d6-bee6-4e82b40ed271',
    'bf6f1d78-6a0d-4afb-aea6-17a3c34340db',
    'a3becdde-018b-46f0-adea-d587076eef4a',
    'a9ad3a05-24fb-4e59-85b0-ee09e55a4492',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
]

# Establish API client
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")

df_results = pd.DataFrame(columns = ["dataset_id", "source_workspace", "sample_id", "chip_well_barcode", "collaborator_participant_id", "collaborator_sample_id"])
for dataset_id in dataset_id_list:
    
    # Retrieve dataset details and pull source workspace(s)
    print(f"Processing dataset_id = '{dataset_id}'...")
    dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except:
        source_workspace = ""
    
    # Pull sample data
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    client = bigquery.Client()
    query = f"""SELECT *, '{dataset_id}' AS dataset_id, '{source_workspace}' AS source_workspace,  FROM `{bq_project}.{bq_schema}.sample`"""
    try:
        df_output = client.query(query).result().to_dataframe()
        present_col_list = [col for col in list(df_output.columns.values) if col in ["dataset_id", "source_workspace", "sample_id", "chip_well_barcode", "collaborator_participant_id", "collaborator_sample_id"]]
        df_results = pd.concat([df_results, df_output[present_col_list]], ignore_index=True)
    except Exception as e:
        print(f"Error pulling data for dataset_id = '{dataset_id}'. Error: {str(e)}")
                                                                           
# Sort dataframe records and write out to file
df_sorted = df_results.sort_values(["source_workspace", "dataset_id", "sample_id"], ascending=[True, True, True], ignore_index=True)
output_file_path = "ccdg_cvd_sample_metadata.tsv"
df_sorted.to_csv(output_file_path, index=False, sep="\t")
!gsutil cp $output_file_path $ws_bucket/ingest_pipeline/resources/misc/ 2> stdout
!rm $output_file_path
print(f"End time: {datetime.datetime.now()}")
print(f"Results copied to: {ws_bucket}/ingest_pipeline/resources/misc/{output_file_path}")

Start time: 2024-08-02 14:47:17.608014
Processing dataset_id = 'd6518df9-fc11-46ed-9c12-b9782d3829a0'...
Processing dataset_id = '9ee2a552-89f8-4a48-9c94-9fa26ebb7483'...
Processing dataset_id = '425412ba-894a-4824-acb8-bf18fe4576e0'...
Processing dataset_id = 'f22bd762-5c45-453e-bf22-b174514abb84'...
Processing dataset_id = '0ee62643-b064-42f8-9b09-5d10eacd70a3'...
Processing dataset_id = '1a7f6728-5116-4f24-897a-59a7f322cfd2'...
Processing dataset_id = 'c37b388c-7107-43d6-bee6-4e82b40ed271'...
Processing dataset_id = 'bf6f1d78-6a0d-4afb-aea6-17a3c34340db'...
Processing dataset_id = 'a3becdde-018b-46f0-adea-d587076eef4a'...
Processing dataset_id = 'a9ad3a05-24fb-4e59-85b0-ee09e55a4492'...
Processing dataset_id = 'd56ae233-d6d2-483c-917e-1de0fe1cfeb7'...


## Pulling Subject Metadata for CCDG Afib

In [None]:
# Define parameters
# NOTE(review): billing_profile is assigned here but not referenced anywhere else in this cell — confirm whether it is still needed
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Define table of interest
# This name selects the BigQuery table queried for each dataset and is also used to build the `<table_name>_id` sort column and the output file name
table_name = "participant"

# List of datasets to pull data from
dataset_id_list = [
    'dd2cb8fc-42a6-482f-898e-ef6125feccb8',
    '51e9935f-ec18-4832-801a-6d9186537572',
    'e68d1d39-99df-4cd7-8053-1b298f03eabb',
    'a08dc7a6-f8ce-4205-95d2-83f614c2c32f',
    '36bdd59f-4f5b-43cd-8d34-a21ef87bbf30',
    'c4c49fcd-0c20-4cff-841a-cb58f5689c5b',
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
    '9ee2a552-89f8-4a48-9c94-9fa26ebb7483',
    '425412ba-894a-4824-acb8-bf18fe4576e0',
    'f22bd762-5c45-453e-bf22-b174514abb84',
    '0ee62643-b064-42f8-9b09-5d10eacd70a3',
    '1a7f6728-5116-4f24-897a-59a7f322cfd2',
    'c37b388c-7107-43d6-bee6-4e82b40ed271',
    'bf6f1d78-6a0d-4afb-aea6-17a3c34340db',
    'a3becdde-018b-46f0-adea-d587076eef4a',
    'a9ad3a05-24fb-4e59-85b0-ee09e55a4492',
    '719f7581-21db-4aec-8c46-4a5811832710',
    '318a75f4-ac50-4944-81b0-70a1323e7497',
    '75fb0984-2124-444f-881b-30a1a6f8b8f7',
    '15be288e-53e1-41cb-8d20-8ea87efb9258',
    '700303c2-fcef-48a5-9900-096bf34e2d83',
    '38fd20ce-affd-4791-9810-7f5a7fe876d0',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
    'd7686f98-05a4-45c9-af2e-3ebc524a5b2d',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '8ccefc59-38a5-476f-b7d3-3f98315a97f0',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '6e67e1e1-5c39-43da-960f-48385789c4e1',
    '92382848-f5e9-426c-b7dc-f2841ae97018',
    '4999a410-990e-484b-b4f3-d636f894a741',
    '1f534eb4-701f-4182-9895-64c5e5b52d82',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    'feca4815-b44b-4b2b-8d77-75edd62ba5a6',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '7427b2eb-a84f-413c-bfb0-7d2e36b0628f',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '28c3df75-0b08-4d5c-9feb-6e2e918572ea',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    'f69c21e9-cb5f-4e72-acfe-c54b672a9f3b',
    '2ef4530a-cc36-4f32-9a1a-63a555346587',
    'e917c83d-c482-442d-81ce-869de7d20903',
    '10774229-1487-4188-b2c3-1fabcf85492a',
    '65793118-3c88-4185-9172-2354850e6056',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    '46536136-08e4-4521-8e6c-67f023de020d',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    'b8e7fe18-9c3d-4cc0-bbc7-85b27197fc8f',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    'b60b4737-c646-4299-85a0-520890e830b7',
]

# Establish API client
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")

df_results = pd.DataFrame()
for dataset_id in dataset_id_list:
    
    # Retrieve dataset details and pull source workspace(s)
    print(f"Processing dataset_id = '{dataset_id}'...")
    dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except:
        source_workspace = ""
    
    # Pull data
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    client = bigquery.Client()
    query = f"""SELECT '{dataset_id}' AS dataset_id, '{source_workspace}' AS source_workspace, * FROM `{bq_project}.{bq_schema}.{table_name}`"""
    try:
        df_output = client.query(query).result().to_dataframe()
        df_results = pd.concat([df_results, df_output], ignore_index=True)
    except Exception as e:
        print(f"Error pulling data for dataset_id = '{dataset_id}'. Error: {str(e)}")
        no_tab_recs = {"dataset_id": [dataset_id], "source_workspace": [source_workspace], f"{table_name}_id": ["TABLE NOT FOUND"]}
        df_no_tab_recs = pd.DataFrame(data=no_tab_recs)
        df_results = pd.concat([df_results, df_no_tab_recs], ignore_index=True)
                                                                           
# Sort dataframe records and write out to file
df_sorted = df_results.sort_values(["source_workspace", "dataset_id", f"{table_name}_id"], ascending=[True, True, True], ignore_index=True)
output_file_path = f"ccdg_afib_{table_name}_metadata.tsv"
df_sorted.to_csv(output_file_path, index=False, sep="\t")
!gsutil cp $output_file_path $ws_bucket/ingest_pipeline/resources/misc/ 2> stdout
!rm $output_file_path
print(f"End time: {datetime.datetime.now()}")
print(f"Results copied to: {ws_bucket}/ingest_pipeline/resources/misc/{output_file_path}")