In [None]:
# !pip install --upgrade data_repo_client
# !pip install import_ipynb xmltodict

In [1]:
# Imports
import import_ipynb
import ingest_pipeline_utilities as utils
import build_mapping_query as bmq
import data_repo_client
from google.cloud import bigquery
from google.cloud import storage
import google.auth
import google.auth.transport.requests
import pandas as pd
import pandas_gbq
import datetime
import os
import re
import time
import requests
import logging
import json
# Workspace context read from environment variables; a missing variable raises KeyError
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]
# Bucket name with the leading "gs://" scheme stripped
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

# Display options
# None removes the row, column and column-width limits, so DataFrames are displayed in full
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)


Version 1.0.47: 1/27/2025 1:22pm - Nate Calvanese - Removed all references to anvil_tdr_ingest (including by adding ingest SA to auth domains directly).
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
Version 2.0.4: 10/18/2024 2:19pm - Nate Calvanese - Updated get_objects_list function to not use fuzzy matching for full file paths
Version 1.0.10: 1/12/2024 11:25am - Nate Calvanese - Made max_combined_rec_ref_size configurable
Version 1.0.18: 10/23/2024 2:16pm - Nate Calvanese - Added support for queries without a source table (all hardcoded values)
Version 2.0.8: 9/20/2024 9:06pm -- Added high-priority flags in the object returned by the function
Version 1.0.4: 09/26/2024 9:28am - Nate Calvanese - Improved logic for handling part_of_dataset field
Version 1.0.4: 11/05/2024 8:11pm - Nate Calvanese - Enforced deduplication logic in queries
Version 1.0.2: 10/4/2023 10:40am - Nate Calvanese - Updated query logic and added validation


# TDR Reader Management

## Remove Undesired Readers from TDR Snapshots

In [None]:
# Function to remove erroneous readers from snapshot
def clean_up_ad_readers(snapshot_id, readers):
    """Remove every member of a snapshot's "reader" policy that is not in ``readers``.

    Args:
        snapshot_id: ID of the TDR snapshot whose reader policy is cleaned up.
        readers: Reader emails that are allowed to stay. If the list contains the
            literal '$AUTH_DOMAIN', every auth domain listed in the dataset
            properties of the snapshot's first source is allowed as well (as
            '<auth_domain>@firecloud.org'). The caller's list is never modified.

    Returns:
        None. Progress, failed deletions and the remaining readers are printed.
    """
    print("Cleaning up readers for {}...".format(snapshot_id))
    # Work on a copy: appending auth domain groups to the caller's list would carry
    # one snapshot's auth domains over into the allow-list of the next snapshot.
    reader_list = list(readers)
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

    # Retrieve snapshot, grab auth_domain
    if '$AUTH_DOMAIN' in reader_list:
        snapshot_response = snapshots_api.retrieve_snapshot(id=snapshot_id)
        snapshot_name = snapshot_response.name
        print("Snapshot name: {}".format(snapshot_name))
        try:
            auth_domain_list = snapshot_response.source[0].dataset_properties["auth_domains"] or []
        except (AttributeError, IndexError, KeyError, TypeError):
            # No source, no dataset properties or no "auth_domains" entry: nothing to add
            auth_domain_list = []
        reader_list.extend(ad + "@firecloud.org" for ad in auth_domain_list)

    # Retrieve snapshot policies and delete readers that aren't in reader list
    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id)
    delete_count = 0
    for policy in snapshot_policy_response.policies:
        if policy.name != "reader":
            continue
        for policymember in policy.members or []:
            if policymember in reader_list:
                continue
            # The client is rebuilt before every delete, as the original code did
            # (presumably to keep the token fresh on long runs -- TODO confirm)
            api_client = utils.refresh_tdr_api_client()
            snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
            try:
                snapshots_api.delete_snapshot_policy_member(id=snapshot_id, policy_name="reader", member_email=policymember)
                delete_count += 1
            except Exception as e:
                # Best effort: report the failure and carry on with the next member
                print(f"\tFailed to delete reader {policymember}: {str(e)}")

    # Print results
    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id)
    print(f"\t{delete_count} erroneous readers deleted.")

    for role in snapshot_policy_response.policies:
        if role.name == "reader":
            rem_readers = ", ".join(role.members or [])
            print(f"\tRemaining readers: {rem_readers}")
    return

# Clean-up snapshots
# reader_list holds the reader emails to keep; clean_up_ad_readers removes every other
# member of each listed snapshot's "reader" policy.
reader_list = ["azul-anvil-prod@firecloud.org"]  # add '$AUTH_DOMAIN' to also keep the snapshot's auth domain groups
# IDs of the snapshots to clean up
snapshot_id_list = [
'b0fc6253-d274-4e53-9977-85d943116f7c',
]
for snapshot_id in snapshot_id_list:
    clean_up_ad_readers(snapshot_id, reader_list)


## Add Auth Domain Users to TDR Snapshots

In [None]:
# Function to add a snapshot's auth domain groups as readers on the snapshot
def restore_ad_readers(snapshot_id):
    """Add the auth domain groups of a snapshot's source dataset as snapshot readers.

    The auth domains are read from the dataset properties of the snapshot's first
    source; each one is added to the snapshot's "reader" policy as
    '<auth_domain>@firecloud.org'. Nothing is added when no auth domains are found.

    Args:
        snapshot_id: ID of the TDR snapshot to update.

    Returns:
        None. The snapshot name and the resulting readers are printed.
    """
    print("Restoring AD readers for {}...".format(snapshot_id))
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

    # Retrieve snapshot, grab auth_domain
    snapshot_response = snapshots_api.retrieve_snapshot(id=snapshot_id)
    snapshot_name = snapshot_response.name
    print("Snapshot name: {}".format(snapshot_name))
    try:
        # "or []" covers an "auth_domains" entry that is present but empty/None
        auth_domain_list = snapshot_response.source[0].dataset_properties["auth_domains"] or []
    except (AttributeError, IndexError, KeyError, TypeError):
        # No source, no dataset properties or no "auth_domains" entry: nothing to add
        auth_domain_list = []
    reader_list = [ad + "@firecloud.org" for ad in auth_domain_list]

    # Add auth_domain groups as readers on the snapshot
    for reader_email in reader_list:
        snapshots_api.add_snapshot_policy_member(id=snapshot_id, policy_name="reader", policy_member={"email": reader_email})

    # Print results
    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id)
    for role in snapshot_policy_response.policies:
        if role.name == "reader":
            rem_readers = ", ".join(role.members)
            print(f"\tCurrent readers: {rem_readers}")
    return

# Restore auth domain readers on snapshots
# List the IDs of the snapshots to process here; with an empty list the loop does nothing
snapshot_id_list = [

]
for snapshot_id in snapshot_id_list:
    restore_ad_readers(snapshot_id)
    

# Collect AnVIL Snapshots and Datasets

In [None]:
# Dataset_ID Filter
# Only datasets whose ID appears in this list are collected by the loop below;
# an empty list disables the filter, so every enumerated dataset is considered.
dataset_id_list = [
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '119c7ceb-ad4e-4b6c-9f5f-edb08239aee7',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    '2cace5dc-f660-45d4-b689-c4c89e77697c',
    '68ea655f-b4a3-43e2-95e4-f158ca2d67dd',
    '6fd0f009-3c34-4529-9a38-c59745545490',
    '841b7883-9447-4ea0-ae4a-84ea0240d919',
    '96461004-f4b3-4f82-a842-293b3ec46a60',
    '9f4ac69c-0919-4ac1-98a8-976ed79ace03',
    'a3ea4f97-6657-4d3c-9be6-96f097f5c952',
    'bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8',
    'd0fc3d6a-c3f4-4533-8a23-817a4e27f9be',
    'd2272f2d-c606-4027-b8ea-0bdd6d9d6535',
    'd596ee91-481c-4eb5-9a8a-88c1e10ba9b6',
    'ec6f49a2-176c-4564-82c5-e751baab46aa',
]

# Collect Anvil datasets and snapshots
# Builds one record per snapshot (or one record with empty snapshot fields for a dataset
# without snapshots) and displays the result as a sorted DataFrame.
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
logging.info(f"Start time: {current_datetime_string}")
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
datasets_list = datasets_api.enumerate_datasets(filter="anvil", limit=2000)
# Progress total = number of datasets that pass the ID filter. An empty filter list means
# "no filter", so the total must be the full enumeration rather than 0.
if len(dataset_id_list) == 0:
    dataset_list_len = len(datasets_list.items)
else:
    dataset_list_len = sum(1 for entry in datasets_list.items if entry.id in dataset_id_list)
records_list = []
dataset_count = 0
for dataset_entry in datasets_list.items:
    if len(dataset_id_list) == 0 or dataset_entry.id in dataset_id_list:
        dataset_count += 1
        logging.info(f"Processing dataset {dataset_count} of {dataset_list_len}")
        # Only datasets named like ANVIL_<name>_<8 digits> are collected
        if re.match("^ANVIL_[a-zA-Z0-9-_]+_[0-9]{8}", dataset_entry.name.upper()):
            dataset_detail = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["PROPERTIES", "DATA_PROJECT"])
            snapshots_list = snapshots_api.enumerate_snapshots(dataset_ids=[dataset_entry.id], limit=1000)
            try:
                source_workspace = ", ".join(dataset_detail.properties["source_workspaces"])
            except (AttributeError, KeyError, TypeError):
                # No properties or no usable "source_workspaces" entry on the dataset
                source_workspace = ""
            if len(snapshots_list.items) == 0:
                record = [None, None, None, None, None, None, None, None, None, dataset_entry.id, dataset_entry.name, dataset_detail.ingest_service_account, dataset_entry.created_date[0:10], dataset_entry.cloud_platform, dataset_entry.secure_monitoring_enabled, source_workspace]
                records_list.append(record)
            else:
                snapshot_list_len = len(snapshots_list.items)
                snapshot_count = 0
                for snapshot_entry in snapshots_list.items:
                    snapshot_count += 1
                    logging.info(f"Processing snapshot {snapshot_count} of {snapshot_list_len} for dataset {dataset_count}")
                    # Get public policy information
                    creds, project = google.auth.default()
                    auth_req = google.auth.transport.requests.Request()
                    creds.refresh(auth_req)
                    public_flag = "N"
                    public_response = requests.get(
                        url=f"https://sam.dsde-prod.broadinstitute.org/api/resources/v2/datasnapshot/{snapshot_entry.id}/policies/reader/public",
                        headers={"Authorization": f"Bearer {creds.token}"},
                    )
                    if public_response.text == "true":
                        public_flag = "Y"
                    # Get snapshot DUOS ID and Lock status
                    api_client = utils.refresh_tdr_api_client()
                    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
                    snapshot_detail = snapshots_api.retrieve_snapshot(id=snapshot_entry.id, include=["DUOS"])
                    duos_id = ""
                    if snapshot_detail.duos_firecloud_group:
                        duos_id = snapshot_detail.duos_firecloud_group.duos_id
                    lock_name = snapshot_detail.resource_locks.exclusive
                    lock_status = bool(lock_name)
                    # Get snapshot readers and auth domain
                    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_entry.id)
                    # Reset per snapshot so a snapshot without a "reader" policy does not
                    # inherit the previous snapshot's readers (or hit an unbound name)
                    readers = ""
                    for role in snapshot_policy_response.policies:
                        if role.name == "reader":
                            readers = ", ".join(role.members or [])
                    ad_groups = ""
                    if snapshot_policy_response.auth_domain:
                        ad_groups = ", ".join(snapshot_policy_response.auth_domain)
                    record = [snapshot_entry.id, snapshot_entry.name, snapshot_entry.created_date[0:10], public_flag, readers, ad_groups, duos_id, snapshot_entry.data_project, lock_status, dataset_entry.id, dataset_entry.name, dataset_detail.ingest_service_account, dataset_entry.created_date[0:10], dataset_entry.cloud_platform, dataset_entry.secure_monitoring_enabled, source_workspace]
                    records_list.append(record)
df = pd.DataFrame(records_list, columns =["Snapshot ID", "Snapshot Name", "Snapshot Created Date", "Snapshot Public", "Snapshot Readers", "Snapshot Auth Domain", "Snapshot DUOS ID", "Snapshot Data Project", "Snapshot Locked", "Source Dataset ID", "Source Dataset Name", "Source Dataset SA", "Source Dataset Created Date", "Cloud Platform", "Secure Monitoring", "Source Workspace"])
df_sorted = df.sort_values(["Source Workspace", "Source Dataset Name", "Snapshot Name"], ascending=[True, True, True], ignore_index=True)
current_datetime_string = datetime.datetime.now().strftime("%Y%m%d %H:%M:%S")
logging.info(f"End time: {current_datetime_string}")
display(df_sorted)


# TDR Dataset and Snapshot Stats

## Release Snapshot Stats

In [None]:
#############################################
## Functions
#############################################

def return_snapshot_stats(release_tag, snapshot_id_list, mapped_column_list, target_bigquery_table):
    """Profile TDR snapshots in BigQuery and append the metrics to a BigQuery table.

    For each snapshot the function retrieves the snapshot's tables and BigQuery
    location from TDR, deletes the rows already stored in ``target_bigquery_table``
    for the same release tag and snapshot, and then collects per table:
      * table metrics: column count, fileref column count, record count and, for
        ``anvil_file``, total file bytes and supplementary file count;
      * column metrics: nulls/empty lists and approximate distinct values per column;
      * column value metrics: record counts per value for the mapped columns.
    The rows of each snapshot are displayed and appended to ``target_bigquery_table``.
    A failed TDR lookup or profiling query is recorded as a row with status "Error"
    rather than raised; a failure while loading the rows into BigQuery is raised.

    Args:
        release_tag: Value written to the ``release_tag`` column (also used to select
            the rows to delete before re-loading).
        snapshot_id_list: IDs of the snapshots to profile.
        mapped_column_list: Dicts with the keys "table", "column" and "array"; a truthy
            "array" marks a column that is UNNESTed before its values are counted.
        target_bigquery_table: Table the result rows are appended to.

    Returns:
        None.
    """
    result_columns = ["release_tag", "snapshot_id", "base_dataset", "table_type", "table_name", "column_name", "column_value", "metric_type", "metric", "numerator", "denominator", "result", "status", "message"]

    # Group the mapped columns by table, keeping array and non-array columns apart
    mapped_array_columns = {}
    mapped_non_array_columns = {}
    for mapped_column in mapped_column_list:
        tab_name = mapped_column["table"]
        col_name = mapped_column["column"]
        if mapped_column["array"]:
            mapped_array_columns.setdefault(tab_name, []).append(col_name)
        else:
            mapped_non_array_columns.setdefault(tab_name, []).append(col_name)

    # One client and one load configuration serve every snapshot
    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig(
        schema=[
            bigquery.SchemaField("release_tag", bigquery.enums.SqlTypeNames.STRING),
            bigquery.SchemaField("snapshot_id", bigquery.enums.SqlTypeNames.STRING),
            bigquery.SchemaField("base_dataset", bigquery.enums.SqlTypeNames.STRING),
            bigquery.SchemaField("table_type", bigquery.enums.SqlTypeNames.STRING),
            bigquery.SchemaField("table_name", bigquery.enums.SqlTypeNames.STRING),
            bigquery.SchemaField("column_name", bigquery.enums.SqlTypeNames.STRING),
            bigquery.SchemaField("column_value", bigquery.enums.SqlTypeNames.STRING),
            bigquery.SchemaField("metric_type", bigquery.enums.SqlTypeNames.STRING),
            bigquery.SchemaField("metric", bigquery.enums.SqlTypeNames.STRING),
            bigquery.SchemaField("numerator", bigquery.enums.SqlTypeNames.INTEGER),
            bigquery.SchemaField("denominator", bigquery.enums.SqlTypeNames.INTEGER),
            bigquery.SchemaField("result", bigquery.enums.SqlTypeNames.FLOAT64),
            bigquery.SchemaField("status", bigquery.enums.SqlTypeNames.STRING),
            bigquery.SchemaField("message", bigquery.enums.SqlTypeNames.STRING),
        ],
        write_disposition="WRITE_APPEND"
    )

    # Loop through and process snapshots
    for snapshot_id in snapshot_id_list:

        # Rows collected for this snapshot
        results = []

        # Grab schema and access information for the snapshot
        logging.info(f"Collecting stats for snapshot_id {snapshot_id}...")
        api_client = utils.refresh_tdr_api_client()
        snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
        try:
            response = snapshots_api.retrieve_snapshot(id=snapshot_id, include=["TABLES", "ACCESS_INFORMATION"]).to_dict()
            snapshot_name = response["name"]
            # A name that does not match <base>_<8 digits>_ANV5_<12 digits> raises here
            # and is reported through the error row below
            base_dataset_name = re.search(r"^(.*)_[0-9]{8}_ANV5_[0-9]{12}$", snapshot_name)[1]
            tdr_schema_dict = {}
            tdr_schema_dict["tables"] = response["tables"]
            bq_project = response["access_information"]["big_query"]["project_id"]
            bq_dataset = response["access_information"]["big_query"]["dataset_name"]
        except Exception as e:
            err_msg = f"Error retrieving snapshot details: {str(e)}"
            logging.error(err_msg)
            results.append([release_tag, snapshot_id, "", "All", "All", "All", "All", "All", "All", 0, 0, 0, "Error", err_msg])

        if not results:
            # Clear records from target BQ table (best effort: a failure is only logged)
            delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE release_tag = '{release_tag}' AND snapshot_id = '{snapshot_id}'"""
            try:
                client.query(delete_query).result()
            except Exception as e:
                logging.info(f"Error deleting records for the snapshots for the specified release from the target BQ table: {str(e)}")

            # Pull table stats
            for table in tdr_schema_dict["tables"]:
                table_name = table["name"]
                column_count = len(table["columns"])
                fileref_count = sum(1 for column in table["columns"] if column["datatype"] == "fileref")
                table_type = "harmonized" if "anvil_" in table_name else "source"
                # Leading fields shared by every row written for this table
                row_prefix = [release_tag, snapshot_id, base_dataset_name, table_type, table_name]

                # Build and execute table metrics query
                if table_name == "anvil_file":
                    query = "SELECT COUNT(*) AS row_count, SUM(file_size) AS total_bytes, SUM(CASE WHEN is_supplementary = True THEN 1 ELSE 0 END) AS supp_file_count FROM `{project}.{dataset}.anvil_file` ".format(project=bq_project, dataset=bq_dataset)
                else:
                    query = "SELECT COUNT(*) AS row_count, 0 AS total_bytes, 0 AS supp_file_count FROM `{project}.{dataset}.{table}` ".format(project=bq_project, dataset=bq_dataset, table=table_name)
                try:
                    df_results = client.query(query).result().to_dataframe()
                    record_count = df_results["row_count"].values[0]
                    total_bytes = df_results["total_bytes"].values[0]
                    supp_file_count = df_results["supp_file_count"].values[0]
                    supp_file_perc = round(supp_file_count / record_count, 2) if record_count > 0 else 0
                except Exception as e:
                    err_msg = f"Error retrieving data from BigQuery: {str(e)}"
                    failed_metrics = ["Count of Columns", "Count of Fileref Columns", "Count of Records"]
                    if table_name == "anvil_file":
                        failed_metrics += ["Sum of File Bytes", "Count of Supplementary Files"]
                    for metric in failed_metrics:
                        results.append(row_prefix + ["All", "All", "Table Metrics", metric, 0, 0, 0, "Error", err_msg])
                    # The record count is unknown here, so the column metrics are reported
                    # as errors instead of a misleading "No records in table" success
                    for metric in ["Count of nulls or empty lists in column", "Count of distinct values in column"]:
                        results.append(row_prefix + ["All", "All", "Column Metrics", metric, 0, 0, 0, "Error", err_msg])
                    continue

                results.append(row_prefix + ["All", "All", "Table Metrics", "Count of Columns", column_count, 0, 0, "Success", ""])
                results.append(row_prefix + ["All", "All", "Table Metrics", "Count of Fileref Columns", fileref_count, 0, 0, "Success", ""])
                results.append(row_prefix + ["All", "All", "Table Metrics", "Count of Records", record_count, 0, 0, "Success", ""])
                if table_name == "anvil_file":
                    results.append(row_prefix + ["All", "All", "Table Metrics", "Sum of File Bytes", total_bytes, 0, 0, "Success", ""])
                    results.append(row_prefix + ["All", "All", "Table Metrics", "Count of Supplementary Files", supp_file_count, record_count, supp_file_perc, "Success", ""])

                # Empty table: nothing to profile at column level
                if not record_count > 0:
                    results.append(row_prefix + ["All", "All", "Column Metrics", "Count of nulls or empty lists in column", 0, 0, 0, "Success", "No records in table"])
                    results.append(row_prefix + ["All", "All", "Column Metrics", "Count of distinct values in column", 0, 0, 0, "Success", "No records in table"])
                    continue

                # Build and execute null column count query
                # (raw string so the regex backslashes reach BigQuery unchanged)
                null_query = r"""WITH null_counts AS
                            (
                              SELECT column_name, COUNT(1) AS cnt
                              FROM `{project}.{dataset}.{table}`, 
                              UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(`{project}.{dataset}.{table}`), r'"(\w+)":(?:null|\[\])')) column_name
                              GROUP BY column_name
                            ),
                            table_count AS
                            (
                              SELECT COUNT(*) AS cnt FROM `{project}.{dataset}.{table}`
                            )
                            SELECT 'Summary Stats' AS metric_type, src.table_name AS source_table, src.column_name AS source_column, 
                            'Count of nulls or empty lists in column' AS metric,
                            COALESCE(tar.cnt, 0) AS n, 
                            table_count.cnt AS d,
                            ROUND(CASE WHEN table_count.cnt > 0 THEN COALESCE(tar.cnt, 0)/table_count.cnt END, 2) AS r,
                            null AS flag
                            FROM `{project}.{dataset}.INFORMATION_SCHEMA.COLUMNS` src
                              LEFT JOIN null_counts tar ON src.column_name = tar.column_name
                              CROSS JOIN table_count
                            WHERE src.table_name = '{table}'
                            AND src.column_name NOT IN ('datarepo_row_id', 'datarepo_ingest_date')""".format(project = bq_project, dataset=bq_dataset, table=table_name)
                try:
                    df_result = client.query(null_query).result().to_dataframe()
                    for _, row in df_result.iterrows():
                        results.append(row_prefix + [row["source_column"], "All", "Column Metrics", row["metric"], row["n"], row["d"], row["r"], "Success", ""])
                except Exception as e:
                    err_msg = f"Error retrieving data from BigQuery: {str(e)}"
                    results.append(row_prefix + ["All", "All", "Column Metrics", "Count of nulls or empty lists in column", 0, 0, 0, "Error", err_msg])

                # Build and execute distinct column value query
                # (raw string so the regex backslashes reach BigQuery unchanged)
                distinct_query = r"""WITH distinct_counts AS
                            (
                              SELECT column_name, APPROX_COUNT_DISTINCT(CASE WHEN column_value NOT IN ('null', '[]') THEN column_value END) AS cnt
                              FROM `{project}.{dataset}.{table}`,
                              UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(`{project}.{dataset}.{table}`), r'"(\w+)":')) AS column_name WITH OFFSET pos1,
                              UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(`{project}.{dataset}.{table}`), r':(.+?),')) AS column_value WITH OFFSET pos2
                              WHERE pos1 = pos2
                              GROUP BY column_name
                            ),
                            table_count AS
                            (
                              SELECT COUNT(*) AS cnt FROM `{project}.{dataset}.{table}`
                            )
                            SELECT 'Summary Stats' AS metric_type, src.table_name AS source_table, src.column_name AS source_column, 
                            'Count of distinct values in column' AS metric,
                            COALESCE(tar.cnt, 0) AS n, 
                            table_count.cnt AS d,
                            ROUND(CASE WHEN table_count.cnt > 0 THEN COALESCE(tar.cnt, 0)/table_count.cnt END, 2) AS r,
                            null AS flag
                            FROM `{project}.{dataset}.INFORMATION_SCHEMA.COLUMNS` src
                              LEFT JOIN distinct_counts tar ON src.column_name = tar.column_name
                              CROSS JOIN table_count
                            WHERE src.table_name = '{table}'
                            AND src.column_name NOT IN ('datarepo_row_id', 'datarepo_ingest_date')""".format(project = bq_project, dataset = bq_dataset, table = table_name)
                try:
                    df_result = client.query(distinct_query).result().to_dataframe()
                    for _, row in df_result.iterrows():
                        results.append(row_prefix + [row["source_column"], "All", "Column Metrics", row["metric"], row["n"], row["d"], row["r"], "Success", ""])
                except Exception as e:
                    err_msg = f"Error retrieving data from BigQuery: {str(e)}"
                    results.append(row_prefix + ["All", "All", "Column Metrics", "Count of distinct values in column", 0, 0, 0, "Error", err_msg])

                # Build and execute mapped values query (non-array fields)
                for column_name in mapped_non_array_columns.get(table_name, []):
                    mapped_value_query = f"""WITH table_count AS
                                                     (
                                                         SELECT COUNT(*) AS cnt FROM `{bq_project}.{bq_dataset}.{table_name}`
                                                     )   
                                                     SELECT '{column_name}' AS column_name, {column_name} AS column_value, COUNT(*) AS n,
                                                     MAX(table_count.cnt) AS d,
                                                     ROUND(CASE WHEN MAX(table_count.cnt) > 0 THEN COUNT(*)/MAX(table_count.cnt) END, 2) AS r
                                                     FROM `{bq_project}.{bq_dataset}.{table_name}` 
                                                         CROSS JOIN table_count
                                                     WHERE {column_name} IS NOT NULL
                                                     GROUP BY {column_name}"""
                    try:
                        df_result = client.query(mapped_value_query).result().to_dataframe()
                        for _, row in df_result.iterrows():
                            results.append(row_prefix + [row["column_name"], row["column_value"], "Column Value Metrics", "Count of records with value", row["n"], row["d"], row["r"], "Success", ""])
                    except Exception as e:
                        err_msg = f"Error retrieving data from BigQuery: {str(e)}"
                        results.append(row_prefix + ["All", "All", "Column Value Metrics", "Count of records with value", 0, 0, 0, "Error", err_msg])

                # Build and execute mapped values query (array fields)
                for column_name in mapped_array_columns.get(table_name, []):
                    mapped_value_query = f"""WITH table_count AS
                                                     (
                                                         SELECT COUNT(*) AS cnt FROM `{bq_project}.{bq_dataset}.{table_name}`
                                                     )   
                                                     SELECT '{column_name}' AS column_name, {column_name}_unnested AS column_value, COUNT(*) AS n,
                                                     MAX(table_count.cnt) AS d,
                                                     ROUND(CASE WHEN MAX(table_count.cnt) > 0 THEN COUNT(*)/MAX(table_count.cnt) END, 2) AS r
                                                     FROM `{bq_project}.{bq_dataset}.{table_name}` 
                                                         CROSS JOIN UNNEST({column_name}) AS {column_name}_unnested
                                                         CROSS JOIN table_count
                                                     WHERE {column_name}_unnested IS NOT NULL
                                                     GROUP BY {column_name}_unnested"""
                    try:
                        df_result = client.query(mapped_value_query).result().to_dataframe()
                        for _, row in df_result.iterrows():
                            results.append(row_prefix + [row["column_name"], row["column_value"], "Column Value Metrics", "Count of records with value", row["n"], row["d"], row["r"], "Success", ""])
                    except Exception as e:
                        err_msg = f"Error retrieving data from BigQuery: {str(e)}"
                        results.append(row_prefix + ["All", "All", "Column Value Metrics", "Count of records with value", 0, 0, 0, "Error", err_msg])

        # Convert results for snapshot
        results_df = pd.DataFrame(results, columns=result_columns)
        results_df = results_df.sort_values(by=["base_dataset", "table_type", "table_name", "column_name", "column_value", "metric_type", "metric"], ignore_index=True)
        display(results_df)

        # Write out results (appended; the earlier delete removed this snapshot's old rows)
        job = client.load_table_from_dataframe(results_df, target_bigquery_table, job_config=job_config)
        job.result()

    # Send completion notice
    logging.info("Processing complete.")

#############################################
## Input Parameters
#############################################

# Release tag
# Written to the release_tag column of every result row; existing rows with the same
# release tag and snapshot ID are deleted from the target table before new rows are loaded
release_tag = "anvil8"

# List of snapshots to pull stats for
snapshot_id_list = [
    '01cf2450-604b-43e5-9f4e-9ec4e0bf0a61',
]

# List of mapped columns
# Each entry names a table/column whose per-value record counts are collected;
# "array": True marks a column that is UNNESTed before its values are counted
mapped_column_list = [
    {"table": "anvil_biosample", "column": "biosample_type", "array": False},
    {"table": "anvil_biosample", "column": "anatomical_site", "array": False},
    {"table": "anvil_donor", "column": "organism_type", "array": False},
    {"table": "anvil_donor", "column": "phenotypic_sex", "array": False},
    {"table": "anvil_file", "column": "file_format", "array": False},
    {"table": "anvil_dataset", "column": "consent_group", "array": True},
    {"table": "anvil_dataset", "column": "data_modality", "array": True},
    {"table": "anvil_dataset", "column": "registered_identifier", "array": True},
    {"table": "anvil_diagnosis", "column": "disease", "array": True},
    {"table": "anvil_donor", "column": "reported_ethnicity", "array": True},
]

# Target bigquery table
# Result rows are appended to this table
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_inventory.release_snapshot_stats"

#############################################
## Execution
#############################################

# Collects the stats, displays them per snapshot and loads them into the target table
return_snapshot_stats(release_tag, snapshot_id_list, mapped_column_list, target_bigquery_table)


## Dataset Stats -- Table Count, Row Count, File Size, etc.

In [None]:
#############################################
## Functions
#############################################

# Column layout shared by the per-dataset and aggregated result frames
_DATASET_STATS_COLUMNS = [
    "dataset_id", "dataset_name", "orig_workspace", "table_type", "table_name", "column_name",
    "metric_type", "metric", "numerator", "denominator", "result", "status", "message",
]

# Sort order applied to result frames before they are written out or displayed
_DATASET_STATS_SORT_KEYS = ["dataset_name", "table_type", "table_name", "column_name", "metric_type", "metric"]

# Metric labels for the two column-level profiling queries
_NULL_COUNT_METRIC = "Count of nulls or empty lists in column"
_DISTINCT_COUNT_METRIC = "Count of distinct values in column"

# Raw strings so the BigQuery regex escapes (\w, \[) reach the query untouched
_NULL_COUNT_QUERY = r"""WITH null_counts AS
        (
          SELECT column_name, COUNT(1) AS cnt
          FROM `{project}.{dataset}.{table}`,
          UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(`{project}.{dataset}.{table}`), r'"(\w+)":(?:null|\[\])')) column_name
          GROUP BY column_name
        ),
        table_count AS
        (
          SELECT COUNT(*) AS cnt FROM `{project}.{dataset}.{table}`
        )
        SELECT 'Summary Stats' AS metric_type, src.table_name AS source_table, src.column_name AS source_column,
        'Count of nulls or empty lists in column' AS metric,
        COALESCE(tar.cnt, 0) AS n,
        table_count.cnt AS d,
        ROUND(CASE WHEN table_count.cnt > 0 THEN COALESCE(tar.cnt, 0)/table_count.cnt END, 2) AS r,
        null AS flag
        FROM `{project}.{dataset}.INFORMATION_SCHEMA.COLUMNS` src
          LEFT JOIN null_counts tar ON src.column_name = tar.column_name
          CROSS JOIN table_count
        WHERE src.table_name = '{table}'
        AND src.column_name NOT IN ('datarepo_row_id', 'datarepo_ingest_date')"""

_DISTINCT_COUNT_QUERY = r"""WITH distinct_counts AS
        (
          SELECT column_name, APPROX_COUNT_DISTINCT(CASE WHEN column_value NOT IN ('null', '[]') THEN column_value END) AS cnt
          FROM `{project}.{dataset}.{table}`,
          UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(`{project}.{dataset}.{table}`), r'"(\w+)":')) AS column_name WITH OFFSET pos1,
          UNNEST(REGEXP_EXTRACT_ALL(TO_JSON_STRING(`{project}.{dataset}.{table}`), r':(.+?),')) AS column_value WITH OFFSET pos2
          WHERE pos1 = pos2
          GROUP BY column_name
        ),
        table_count AS
        (
          SELECT COUNT(*) AS cnt FROM `{project}.{dataset}.{table}`
        )
        SELECT 'Summary Stats' AS metric_type, src.table_name AS source_table, src.column_name AS source_column,
        'Count of distinct values in column' AS metric,
        COALESCE(tar.cnt, 0) AS n,
        table_count.cnt AS d,
        ROUND(CASE WHEN table_count.cnt > 0 THEN COALESCE(tar.cnt, 0)/table_count.cnt END, 2) AS r,
        null AS flag
        FROM `{project}.{dataset}.INFORMATION_SCHEMA.COLUMNS` src
          LEFT JOIN distinct_counts tar ON src.column_name = tar.column_name
          CROSS JOIN table_count
        WHERE src.table_name = '{table}'
        AND src.column_name NOT IN ('datarepo_row_id', 'datarepo_ingest_date')"""


def _column_metric_rows(client, table_prefix, metric, query):
    """Run one column-level profiling query and return its result rows (or a single error row)."""
    try:
        df_result = client.query(query).result().to_dataframe()
        return [
            table_prefix + [row["source_column"], "Column Metrics", row["metric"], row["n"], row["d"], row["r"], "Success", ""]
            for _, row in df_result.iterrows()
        ]
    except Exception as e:
        err_msg = f"Error retrieving data from BigQuery: {str(e)}"
        return [table_prefix + ["All", "Column Metrics", metric, 0, 0, 0, "Error", err_msg]]


def _profile_dataset_table(row_prefix, bq_project, bq_dataset, table):
    """Return the table-level and column-level metric rows for one table of a dataset schema."""
    table_name = table["name"]
    columns = table["columns"]
    column_count = len(columns)
    fileref_count = sum(1 for column in columns if column["datatype"] == "fileref")
    table_type = "fss" if "anvil_" in table_name else "source"
    table_prefix = row_prefix + [table_type, table_name]
    is_file_table = table_name == "anvil_file"

    # anvil_file carries two extra table metrics (byte total and supplementary file count)
    table_metrics = ["Count of Columns", "Count of Fileref Columns", "Count of Records"]
    if is_file_table:
        table_metrics += ["Sum of File Bytes (anvil_file)", "Count of Supplementary Files"]
    column_metrics = [_NULL_COUNT_METRIC, _DISTINCT_COUNT_METRIC]

    # Build the table metrics query; COALESCE keeps the sums at 0 (not NULL) for an empty anvil_file table
    if is_file_table:
        query = (
            "SELECT COUNT(*) AS row_count, COALESCE(SUM(file_size), 0) AS total_bytes, "
            "COALESCE(SUM(CASE WHEN is_supplementary = True THEN 1 ELSE 0 END), 0) AS supp_file_count "
            f"FROM `{bq_project}.{bq_dataset}.anvil_file` "
        )
    else:
        query = f"SELECT COUNT(*) AS row_count, 0 AS total_bytes, 0 AS supp_file_count FROM `{bq_project}.{bq_dataset}.{table_name}` "

    rows = []
    try:
        client = bigquery.Client()
        df_results = client.query(query).result().to_dataframe()
        record_count = int(df_results["row_count"].values[0])
        total_bytes = int(df_results["total_bytes"].values[0])
        supp_file_count = int(df_results["supp_file_count"].values[0])
    except Exception as e:
        # Without a record count the column metrics cannot be trusted either, so every
        # metric for this table is reported as an error instead of a misleading success
        err_msg = f"Error retrieving data from BigQuery: {str(e)}"
        for metric in table_metrics:
            rows.append(table_prefix + ["All", "Table Metrics", metric, 0, 0, 0, "Error", err_msg])
        for metric in column_metrics:
            rows.append(table_prefix + ["All", "Column Metrics", metric, 0, 0, 0, "Error", err_msg])
        return rows

    # zip stops at the shorter metric list, so the file-only values are dropped for other tables
    metric_values = [column_count, fileref_count, record_count, total_bytes, supp_file_count]
    for metric, value in zip(table_metrics, metric_values):
        rows.append(table_prefix + ["All", "Table Metrics", metric, value, 0, 0, "Success", ""])

    # Column-level queries are only worth running when the table has records
    if record_count > 0:
        query_args = {"project": bq_project, "dataset": bq_dataset, "table": table_name}
        rows.extend(_column_metric_rows(client, table_prefix, _NULL_COUNT_METRIC, _NULL_COUNT_QUERY.format(**query_args)))
        rows.extend(_column_metric_rows(client, table_prefix, _DISTINCT_COUNT_METRIC, _DISTINCT_COUNT_QUERY.format(**query_args)))
    else:
        for metric in column_metrics:
            rows.append(table_prefix + ["All", "Column Metrics", metric, 0, 0, 0, "Success", "No records in table"])
    return rows


def return_dataset_stats(dataset_id_list, write_out_results, target_bigquery_table, display_results):
    """Collect table- and column-level profiling stats for a list of TDR datasets.

    For each dataset the details are retrieved from TDR and every table in its schema is
    profiled in BigQuery: column count, fileref column count and record count (plus file
    bytes and supplementary file count for anvil_file), followed by per-column counts of
    nulls/empty lists and approximate distinct values for tables that contain records.
    A dataset whose details cannot be retrieved yields a single "Error" row and is not profiled.

    Args:
        dataset_id_list: Iterable of TDR dataset IDs to profile.
        write_out_results: When truthy, existing rows for each dataset are deleted from
            target_bigquery_table and the freshly collected rows are appended to it.
            When falsy, the target table is left untouched.
        target_bigquery_table: Fully qualified BigQuery table receiving the results.
        display_results: When truthy, the aggregated results are displayed as a dataframe;
            otherwise a completion message is logged.

    Returns:
        None.
    """
    agg_results = []

    # Loop through and process datasets
    for dataset_id in dataset_id_list:
        results = []

        # Grab access information from schema
        logging.info(f"Collecting stats for dataset_id {dataset_id}...")
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        dataset_retrieved = False
        try:
            response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            dataset_name = response["name"]
            schema_tables = response["schema"]["tables"]
            bq_project = response["access_information"]["big_query"]["project_id"]
            bq_dataset = response["access_information"]["big_query"]["dataset_name"]
            try:
                source_workspace = response["properties"]["source_workspaces"][0]
            except (KeyError, IndexError, TypeError):
                # Properties or source workspaces are optional; fall back to an empty value
                source_workspace = ""
            dataset_retrieved = True
        except Exception as e:
            err_msg = f"Error retrieving dataset details: {str(e)}"
            logging.error(err_msg)
            # The error row must match the 13-column result layout
            results.append([dataset_id, "", "", "All", "All", "All", "Dataset Metrics", "Dataset Retrieval", 0, 0, 0, "Error", err_msg])

        # Clear records from target BQ table, but only when they are about to be replaced
        if write_out_results:
            logging.info(f"Preparing target BQ table.")
            delete_query = f"""DELETE FROM `{target_bigquery_table}` WHERE dataset_id = '{dataset_id}'"""
            try:
                client = bigquery.Client()
                client.query(delete_query).result()
            except Exception as e:
                logging.warning(f"Error deleting records for the dataset from the target BQ table: {str(e)}")

        # Pull table stats (skipped when the dataset details could not be retrieved)
        if dataset_retrieved:
            logging.info(f"Running data profiling queries.")
            row_prefix = [dataset_id, dataset_name, source_workspace]
            for table in schema_tables:
                results.extend(_profile_dataset_table(row_prefix, bq_project, bq_dataset, table))

        # Convert results for dataset
        results_df = pd.DataFrame(results, columns=_DATASET_STATS_COLUMNS)
        results_df = results_df.sort_values(by=_DATASET_STATS_SORT_KEYS, ignore_index=True)

        # Write out results, if specified
        if write_out_results:
            client = bigquery.Client()
            type_names = bigquery.enums.SqlTypeNames
            # Every column is a STRING except the three numeric result columns
            numeric_types = {"numerator": type_names.INTEGER, "denominator": type_names.INTEGER, "result": type_names.FLOAT64}
            job_config = bigquery.LoadJobConfig(
                schema=[
                    bigquery.SchemaField(column_name, numeric_types.get(column_name, type_names.STRING))
                    for column_name in _DATASET_STATS_COLUMNS
                ],
                write_disposition="WRITE_APPEND"
            )
            job = client.load_table_from_dataframe(results_df, target_bigquery_table, job_config=job_config)
            job.result()

        # Add dataset results to aggregated results
        agg_results.extend(results)

    # Display results
    if display_results:
        print("\nFinal Results:")
        agg_results_df = pd.DataFrame(agg_results, columns=_DATASET_STATS_COLUMNS)
        agg_results_df = agg_results_df.sort_values(by=_DATASET_STATS_SORT_KEYS, ignore_index=True)
        display(agg_results_df)
    else:
        logging.info(f"Processing complete.")
        

#############################################
## Input Parameters
#############################################

# Datasets to pull stats for
dataset_id_list = ['2a263db0-8c33-4171-840f-54bf4755a4b9']

# Output controls: whether to load results into the target BigQuery table,
# which table to load them into, and whether to display them in the notebook
write_out_results = True
target_bigquery_table = "broad-dsde-prod-analytics-dev.anvil_inventory.table_and_column_stats"
display_results = True

#############################################
## Execution
#############################################

return_dataset_stats(
    dataset_id_list=dataset_id_list,
    write_out_results=write_out_results,
    target_bigquery_table=target_bigquery_table,
    display_results=display_results,
)


## Snapshot Row Count Collection 

In [None]:
def return_row_counts(snapshot_id, results_list):
    """Append the combined row count of all tables in a TDR snapshot to results_list.

    The snapshot's table list and BigQuery location are retrieved from TDR, the
    datarepo_row_id values of every distinct table are unioned, and the rows are counted
    with a single BigQuery query. Any failure (snapshot retrieval or query) is logged and
    recorded as a count of 0, as is a snapshot that lists no tables.

    Args:
        snapshot_id: ID of the TDR snapshot to count rows for.
        results_list: List that receives a [snapshot_id, row_count] record; mutated in place.

    Returns:
        The same results_list, with one record appended.
    """
    # Grab access information from schema
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    try:
        response = snapshots_api.retrieve_snapshot(id=snapshot_id, include=["TABLES", "ACCESS_INFORMATION"]).to_dict()
        # De-duplicate table names; sorting keeps the generated query deterministic
        table_names = sorted({table_entry["name"] for table_entry in response["tables"]})
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.error(f"Error retrieving details for snapshot {snapshot_id}: {str(e)}")
        results_list.append([snapshot_id, 0])
        return results_list

    # A snapshot without tables has nothing to count (and would produce an invalid query)
    if not table_names:
        logging.warning(f"No tables found for snapshot {snapshot_id}.")
        results_list.append([snapshot_id, 0])
        return results_list

    # Build row count query
    row_count_subquery = " UNION ALL ".join(
        f"SELECT datarepo_row_id FROM `{bq_project}.{bq_dataset}.{table_name}`"
        for table_name in table_names
    )
    row_count_query = f"SELECT COUNT(*) AS row_count FROM ({row_count_subquery})"

    # Execute query and write results to results list
    try:
        client = bigquery.Client()
        df_results = client.query(row_count_query).result().to_dataframe()
        row_count = int(df_results["row_count"].values[0])
    except Exception as e:
        logging.error(f"Error counting rows for snapshot {snapshot_id}: {str(e)}")
        row_count = 0
    results_list.append([snapshot_id, row_count])
    return results_list
    
# Snapshots to collect row counts for
snapshot_id_list = [
'bb7eaad8-b02c-455c-964d-c9242019d9e5',
]

# Accumulate one [snapshot_id, row_count] record per snapshot
results_list = []
for snapshot_id in snapshot_id_list:
    results_list = return_row_counts(snapshot_id, results_list)

# Present the collected counts as a dataframe
results_df = pd.DataFrame(data=results_list, columns=["snapshot_id", "row_count"])
display(results_df)


# Pulling Dataset Sizes Across AnVIL

## Pulling file counts and sizes from TDR (Total)

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")
records_list = []
client = bigquery.Client()
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    # Only datasets on the AnVIL billing profile are of interest
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details and pull source workspace(s)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except (KeyError, TypeError):
        # Properties / source workspaces are optional on a dataset
        source_workspace = ""

    # Pull data file count and size sum from BigQuery
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    file_size_query = f"""SELECT COUNT(*) AS file_count, COALESCE(SUM(size_in_bytes),0) AS file_size FROM `{bq_project}.{bq_schema}.file_inventory`"""
    try:
        df_output = client.query(file_size_query).result().to_dataframe()
        file_count = df_output["file_count"].values[0]
        byte_size = df_output["file_size"].values[0]
        status = "Success"
    except Exception:
        # Record the failure for this dataset; a kernel interrupt is no longer swallowed here
        file_count = 0
        byte_size = 0
        status = "Error"

    # Build record for dataset
    record = [dataset_entry.id, dataset_entry.name, source_workspace, file_count, byte_size, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Source Workspaces", "File Count", "File Size (Bytes)", "Retrieval Status"])
# Sort on the numeric sizes first; sorting after the string conversion would order them lexicographically ("9" > "10")
df["File Size (Bytes)"] = df["File Size (Bytes)"].astype(int)
df_sorted = df.sort_values(["Source Workspaces", "File Size (Bytes)"], ascending=[True, False], ignore_index=True)
# Render sizes as strings for display only
df_sorted["File Size (Bytes)"] = df_sorted["File Size (Bytes)"].astype(str)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

## Pulling file counts and sizes from TDR (By Source Bucket)

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")
records_list = []
client = bigquery.Client()
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    # Only datasets on the AnVIL billing profile are of interest
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details and pull source workspace(s)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except (KeyError, TypeError):
        # Properties / source workspaces are optional on a dataset
        source_workspace = ""

    # Pull per-bucket file counts and size sums from BigQuery
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    # Raw f-string so the regex escapes reach BigQuery exactly as written
    file_size_query = rf"""SELECT REGEXP_EXTRACT(uri, r'gs:\/\/([a-z0-9\-]+)') AS bucket, COUNT(*) AS file_count, COALESCE(SUM(size_in_bytes),0) AS file_size FROM `{bq_project}.{bq_schema}.file_inventory` GROUP BY bucket"""
    try:
        df_output = client.query(file_size_query).result().to_dataframe()
        status = "Success"
        # One record per source bucket found in the dataset's file inventory
        for source_bucket, file_count, byte_size in zip(df_output["bucket"].values, df_output["file_count"].values, df_output["file_size"].values):
            record = [dataset_entry.id, dataset_entry.name, source_workspace, source_bucket, file_count, byte_size, status]
            records_list.append(record)
    except Exception:
        # Record the failure for this dataset; a kernel interrupt is no longer swallowed here
        source_bucket = ""
        file_count = 0
        byte_size = 0
        status = "Error"
        record = [dataset_entry.id, dataset_entry.name, source_workspace, source_bucket, file_count, byte_size, status]
        records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Source Workspaces", "Source Bucket", "File Count", "File Size (Bytes)", "Retrieval Status"])
# Sort on the numeric sizes first; sorting after the string conversion would order them lexicographically ("9" > "10")
df["File Size (Bytes)"] = df["File Size (Bytes)"].astype(int)
df_sorted = df.sort_values(["Source Workspaces", "File Size (Bytes)"], ascending=[True, False], ignore_index=True)
# Render sizes as strings for display only
df_sorted["File Size (Bytes)"] = df_sorted["File Size (Bytes)"].astype(str)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

## Pulling file counts and sizes from WS Buckets

In [None]:
# Workspace bucket names (without the gs:// prefix) to scan for object count and total size
bucket_list = [
'fc-secure-0075565e-7b76-4eaa-86e4-84a16acd7aba',
'fc-secure-cb321316-7166-4147-979c-5adf12904f30',
'fc-secure-8db2f74a-d4c8-47b9-8deb-bfadbf40ed18',
'fc-secure-276f37f8-c140-4502-9466-30a9be4e0e25',
'fc-secure-d408d9eb-a259-4598-b49d-fc3efafd13de',
'fc-secure-7e69c896-d6c0-4a4e-8490-42cb2d4fdebf',
'fc-secure-9e3357c0-389c-41d7-94ee-56673db6b75f',
'fc-secure-22d0b958-89a2-40be-91a5-efd0a24ccca6',
'fc-secure-320b42bf-eb47-4629-97e7-0ebaf188a091',
'fc-secure-fb033efd-49fe-4487-8b85-79c7b1c28384',
'fc-secure-45ba0648-12a1-4196-be6e-2a15ca834ca6',
'fc-secure-29c3d060-ca72-4de0-b87a-d45aa093ae1d',
'fc-secure-5b25667a-625b-4b0c-8ca2-b488dfce53c3',
'fc-secure-a9e585f8-4539-4d20-accf-d10790dd09d7',
'fc-secure-8ce36ffb-ad87-4942-abdc-2c0c6ce28483',
'fc-secure-356259df-0d87-4ad9-9cfd-0ef7947aeafc',
'fc-secure-87dd2b67-d7fc-49cb-8da9-eafa341cc1fb',
'fc-secure-be182c9d-e20a-43aa-b158-39113ea47705',
'fc-secure-3cbd4d3d-7331-46f9-a98f-ebba0a894562',
'fc-secure-905ccfc2-3a4d-4de7-8fe0-3ff6e1bc27ac',
'fc-secure-d0b94591-646e-4112-9640-9f8b688a222a',
'fc-secure-7cd273e4-2240-474d-aa8b-d02807b380e7',
'fc-secure-7171c5b1-2c83-4dfc-878a-f427ed7397f3',
'fc-secure-e58ec1b0-051d-4577-a85d-7c55ae2c0c51',
'fc-secure-2662b65b-4fec-48d0-bad8-e59e0349e581',
'fc-secure-3a248261-5349-4669-aa8e-9494ccb44c60',
'fc-secure-180323ab-f749-4063-ae83-3bb93c739046',
'fc-secure-bbac96b8-17df-4f33-9e42-5c9b6784e333',
'fc-secure-84b45515-60e8-4e08-9d0b-a960a153f66e',
'fc-secure-55d18a32-ae61-41e5-897a-846a95d97758',
'fc-secure-34427938-7ee7-44a8-9258-5b979d5a0c98',
'fc-db8d28e8-e27d-4c0c-8559-2ac15d4f82c9',
'fc-9ee5368b-50df-44ab-86ab-20d34db6bbcb',
'fc-97e826b0-7f75-4c91-9a42-955967e87a1a',
'fc-64b1886e-5c5b-4f3e-8518-6c4f0cff22b1',
]

# Loop through buckets and record size and file count
print(f"Start time: {datetime.datetime.now()}")
results = []
result_columns = ["Bucket", "Size in Bytes", "Object Count", "Run Status", "Message"]
for bucket in bucket_list:
    start = time.time()
    file_count = 0
    size = 0
    try:
        storage_client = storage.Client()
        storage_bucket = storage_client.bucket(bucket, user_project="anvil-datastorage")
        # Stream the listing instead of holding every blob in memory; the totals are only
        # committed once the listing completes, so a failed scan still reports zeros
        listed_count = 0
        listed_size = 0
        for blob in storage_client.list_blobs(storage_bucket):
            listed_count += 1
            listed_size += blob.size
        file_count = listed_count
        size = listed_size
        status = "Success"
        fail_message = ""
    except Exception as e:
        status = "Failure"
        fail_message = f"; Fail Message: {str(e)}"
    end = time.time()
    duration = round(end-start,2)
    message = f"Duration: {duration}s{fail_message}"
    results.append([bucket, size, file_count, status, message])
    # Show each bucket's result as soon as it is available
    df_temp = pd.DataFrame([[bucket, size, file_count, status, message]], columns=result_columns)
    display(df_temp)
print("---------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------")  
df = pd.DataFrame(results, columns=result_columns)
print(f"End time: {datetime.datetime.now()}")
display(df)

# Pulling MD5 Population Across AnVIL

## Pulling High Level MD5 Population Stats

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")
records_list = []
client = bigquery.Client()
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    # Only datasets on the AnVIL billing profile are of interest
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details and pull source workspace(s)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except (KeyError, TypeError):
        # Properties / source workspaces are optional on a dataset
        source_workspace = ""

    # Pull MD5 summary stats from TDR (COUNT(md5_hash) only counts non-null hashes)
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    file_size_query = f"""SELECT COUNT(*) AS file_count, COUNT(md5_hash) AS file_w_md5 FROM `{bq_project}.{bq_schema}.file_inventory`"""
    try:
        df_output = client.query(file_size_query).result().to_dataframe()
        file_count = df_output["file_count"].values[0]
        file_w_md5 = df_output["file_w_md5"].values[0]
        missing_md5 = bool(file_count != file_w_md5)
        status = "Success"
    except Exception:
        # Record the failure for this dataset; a kernel interrupt is no longer swallowed here
        file_count = 0
        file_w_md5 = 0
        missing_md5 = False
        status = "Error"

    # Build record for dataset
    record = [dataset_entry.id, dataset_entry.name, source_workspace, file_count, file_w_md5, missing_md5, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Source Workspaces", "File Count", "MD5 Populated Count", "Missing MD5s", "Retrieval Status"])
df_sorted = df.sort_values(["Source Workspaces"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

# Pulling Specific Files Across AnVIL

## Pulling Specific Problematic Files

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")

df_results = pd.DataFrame(columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id == billing_profile:
        # Retrieve dataset details and pull source workspace(s)
        dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        try:
            source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
        except:
            source_workspace = ""
        
        # Pull data files with null MD5s
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
        client = bigquery.Client()
        file_size_query = """SELECT uri AS file_name, size_in_bytes AS file_size FROM `{project}.{schema}.file_inventory` WHERE md5_hash IS NULL""".format(project = bq_project, schema = bq_schema)
        try:
            output = client.query(file_size_query).result()
            if output.total_rows > 0:
                df_output = output.to_dataframe()
                df_output.rename(columns = {"file_name":"File Path", "file_size":"Byte Size"}, inplace = True)
                df_output["Dataset UUID"] = dataset_entry.id
                df_output["Dataset Name"] = dataset_entry.name
                df_output["Source Workspaces"] = source_workspace
                df_output["Retrieval Status"] = "Success - Files Found"
                df_results = df_results.append(df_output)
            else:
                output = [[dataset_entry.id, dataset_entry.name, source_workspace, None, 0, "Success - No Files Found"]]
                df_output = pd.DataFrame(output, columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
                df_results = df_results.append(df_output)
        except:
            output = [[dataset_entry.id, dataset_entry.name, source_workspace, None, 0, "Error"]]
            df_output = pd.DataFrame(output, columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
            df_results = df_results.append(df_output)
        
# Sort dataframe records and write out to file
df_sorted = df_results.sort_values(["Source Workspaces", "File Path"], ascending=[True, True], ignore_index=True)
output_file_path = "null_md5_files.tsv"
df_sorted.to_csv(output_file_path, index=False, sep="\t")
!gsutil cp $output_file_path $ws_bucket/ingest_pipeline/resources/ 2> stdout
!rm $output_file_path
print(f"End time: {datetime.datetime.now()}")
print(f"Results copied to: {ws_bucket}/ingest_pipeline/resources/{output_file_path}")

# Examining Target Paths Across AnVIL

## Pulling Target Paths Across AnVIL and Looking for Embedded Buckets

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")
records_list = []
client = bigquery.Client()
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    # Only datasets on the AnVIL billing profile are of interest
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()

    # Count load-history entries whose target path starts with a bucket name ('/fc-...')
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    file_size_query = f"""SELECT COUNT(*) AS file_count FROM `{bq_project}.{bq_schema}.datarepo_load_history` WHERE target_path LIKE '/fc-%'"""
    try:
        df_output = client.query(file_size_query).result().to_dataframe()
        bad_paths = bool(df_output["file_count"].values[0] > 0)
        status = "Success"
    except Exception:
        # Record the failure for this dataset; a kernel interrupt is no longer swallowed here
        bad_paths = False
        status = "Error"

    # Build record for dataset
    record = [dataset_entry.id, dataset_entry.name, bad_paths, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Bad Target Paths", "Retrieval Status"])
df_sorted = df.sort_values(["Dataset Name"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

# Looking for duplicate records across AnVIL

In [None]:
# Scan every AnVIL dataset in TDR and flag those where key_column has repeated values in target_table.

# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"  # Datasets on this billing profile are treated as AnVIL datasets
target_table = "sample"      # Table to check for duplicates
key_column = "sample_id"     # Column expected to be unique within target_table

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Establish BigQuery client (created once and reused for every dataset)
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Reset per dataset so a failure before the query is built never reports the previous dataset's query
    dupe_query = None

    # Any failure for this dataset (TDR retrieval or BigQuery, e.g. the table not existing) is logged
    # and recorded as an "Error" row, so one bad dataset does not abort the scan of the remaining ones
    try:
        # Retrieve dataset details to locate the dataset's BigQuery project and schema
        dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]

        # Count the key values that appear on more than one row
        dupe_query = """SELECT COUNT(*) AS dupe_count FROM (SELECT {pk_col} FROM `{project}.{schema}.{table}` GROUP BY {pk_col} HAVING COUNT(*) > 1)""".format(project = bq_project, schema = bq_schema, table = target_table, pk_col = key_column)
        df_output = client.query(dupe_query).result().to_dataframe()
        dupe_count = int(df_output["dupe_count"].values[0])
        duplicates_found = dupe_count > 0
        status = "Success"
    except Exception as e:
        # "Duplicates" is reported as False here; the "Error" status marks the row as unverified
        logging.warning(f"Duplicate check failed for dataset {dataset_entry.name} ({dataset_entry.id}): {e}")
        duplicates_found = False
        status = "Error"

    # Build record for dataset
    records_list.append([dataset_entry.id, dataset_entry.name, target_table, key_column, duplicates_found, dupe_query, status])

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Table Name", "Key Column", "Duplicates", "Query", "Status"])
df_sorted = df.sort_values(["Dataset Name"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

# Looking for datasets with malformed file relationships

In [None]:
def validate_file_activities(dataset_id):
    """Check a TDR dataset for files generated from more than one source file by file-only activities.

    The check is skipped (reported as success) when the dataset has no files or when every
    file name in `anvil_file` is distinct. Otherwise `anvil_activity` is queried for generated
    files that map to more than one distinct used file, broken down by activity type.

    Args:
        dataset_id: UUID of the TDR dataset to validate.

    Returns:
        A status string starting with "Success - " or "Failure - ". Errors talking to TDR or
        BigQuery are logged and reported as a "Failure - ..." string rather than raised.
    """
    
    # Retrieve dataset information
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    client = bigquery.Client()
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.warning(f"Unable to retrieve dataset info for {dataset_id}: {e}")
        return "Failure - Issue Retrieving Dataset Info"
    
    # Check files for duplicate names
    file_query = """
        SELECT COUNT(*) file_count, COUNT(DISTINCT file_name) AS distinct_file_names
        FROM `{project}.{dataset}.anvil_file`
        """.format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(file_query).result().to_dataframe()
        if df.empty:
            return "Success - No files in dataset"
        file_count = int(df["file_count"].values[0])
        distinct_name_count = int(df["distinct_file_names"].values[0])
        # The aggregate query returns a row even for an empty table, so "no files" has to be
        # detected from the count itself rather than from an empty result set
        if file_count == 0:
            return "Success - No files in dataset"
        if file_count == distinct_name_count:
            return "Success - All file names are distinct"
    except Exception as e:
        logging.warning(f"File name query failed for {dataset_id}: {e}")
        return "Failure - BigQuery Error" 
    
    # Check activities
    activity_query = """
        WITH activity_flattened AS
        (
          SELECT DISTINCT generated_file, activity_type, used_file
          FROM `{project}.{dataset}.anvil_activity`
            CROSS JOIN UNNEST(used_file_id) AS used_file
            CROSS JOIN UNNEST(generated_file_id) AS generated_file
          WHERE ARRAY_LENGTH(used_biosample_id) = 0
        ),
        activity_agg AS
        (
          SELECT generated_file, activity_type, COUNT(DISTINCT used_file)
          FROM activity_flattened
          GROUP BY generated_file, activity_type
          HAVING COUNT(DISTINCT used_file) > 1
        )
        SELECT *
        FROM 
        (
          SELECT 'Files generated from multiple file activities (Activity Type - All)' AS metric, COUNT(DISTINCT generated_file) AS result 
          FROM activity_agg
          UNION ALL
          SELECT 'Files generated from multiple file activities (Activity Type - ' || activity_type || ')' AS metric, COUNT(DISTINCT generated_file) AS result 
          FROM activity_agg
          GROUP BY activity_type
        )
        ORDER BY metric
        """.format(project=bq_project, dataset=bq_dataset)
    try:
        df = client.query(activity_query).result().to_dataframe()
        # A lone row with a zero result is the "All" metric reporting that nothing was found
        if df.empty or (len(df) == 1 and df["result"].values[0] == 0):
            return "Success - No files generated from multiple file activities"

        metric_prefix = "Files generated from multiple file activities (Activity Type - "
        total_file_count = 0
        index_file_count = 0
        checksum_file_count = 0
        unknown_file_count = 0
        for metric, result in zip(df["metric"], df["result"]):
            count = int(result)
            if metric == metric_prefix + "All)":
                total_file_count = count
            elif metric == metric_prefix + "Indexing)":
                index_file_count = count
            elif metric == metric_prefix + "Checksum)":
                checksum_file_count = count
            else:
                # Several unrecognised activity types can come back; sum them instead of keeping only the last
                unknown_file_count += count
        err_msg = f"Failure - Files generated from multiple file activities. All: {str(total_file_count)} Indexing Activities: {str(index_file_count)} Checksum Activities: {str(checksum_file_count)} Unknown Activities: {str(unknown_file_count)}"
        return err_msg
    except Exception as e:
        logging.warning(f"Activity query failed for {dataset_id}: {e}")
        return "Failure - BigQuery Error" 

# Loop through datasets and validate file activities
# Replace the empty placeholder below with the TDR dataset UUIDs to validate
dataset_id_list = [
    ''
]
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Validating dataset_id: {dataset_id}")
    status = validate_file_activities(dataset_id) 
    results.append([dataset_id, status])

# Build the results dataframe once, after the loop, so it is always defined (even for an empty
# dataset list) and never shows a stale frame left over from another cell
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Looking for anvil_activity records with malformed source_datarepo_row_ids

In [None]:
def validate_activities(dataset_id):
    """Check a TDR dataset's anvil_activity table for source_datarepo_row_ids entries lacking a ':'.

    Args:
        dataset_id: UUID of the TDR dataset to validate.

    Returns:
        A status string starting with "Success - " or "Failure - ". Errors talking to TDR or
        BigQuery are reported as a "Failure - ..." string rather than raised.
    """

    # Set up the TDR and BigQuery clients
    tdr_client = utils.refresh_tdr_api_client()
    tdr_datasets = data_repo_client.DatasetsApi(api_client=tdr_client)
    bq_client = bigquery.Client()

    # Look up where the dataset lives in BigQuery
    try:
        details = tdr_datasets.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        # The schema is read but not used further; a response without one counts as a retrieval failure
        schema_info = {"tables": details["schema"]["tables"]}
        bq_info = details["access_information"]["big_query"]
        project_id = bq_info["project_id"]
        dataset_name = bq_info["dataset_name"]
    except Exception:
        return "Failure - Issue Retrieving Dataset Info"

    # Count the unnested source_datarepo_row_ids values that contain no ':' separator
    query = f"""
        SELECT COUNT(*) AS bad_record_count
        FROM `{project_id}.{dataset_name}.anvil_activity`, UNNEST(source_datarepo_row_ids) AS source_datarepo_row_id
        WHERE source_datarepo_row_id NOT LIKE '%:%'
        """
    try:
        counts = bq_client.query(query).result().to_dataframe()
        # An empty result set is treated the same as a zero count
        bad_rows = 0 if counts.empty else counts["bad_record_count"].values[0]
        if bad_rows > 0:
            return "Failure - Malformed source_datarepo_row_ids detected"
        return "Success - No malformed source_datarepo_row_ids detected"
    except Exception:
        return "Failure - BigQuery Error"

# Loop through datasets and validate the source_datarepo_row_ids on their anvil_activity records
# Replace the empty placeholder below with the TDR dataset UUIDs to validate
dataset_id_list = [
    ''
]
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Validating dataset_id: {dataset_id}")
    status = validate_activities(dataset_id) 
    results.append([dataset_id, status])

# Build the results dataframe once, after the loop, so it is always defined (even for an empty
# dataset list) and never shows a stale frame left over from another cell
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Examine source_datarepo_row_id Referential Integrity across AnVIL

In [None]:
#############################################
## Functions
#############################################

def validate_source_datarepo_row_ids(dataset_id, table, source_table):
    """Check that `table`'s source_datarepo_row_ids pointing at `source_table` reference existing rows.

    Entries in source_datarepo_row_ids of the form '<source_table>:<datarepo_row_id>' are
    collected from `table`, and any whose row ID is absent from `source_table` is counted as
    dangling.

    Args:
        dataset_id: UUID of the TDR dataset to validate.
        table: Name of the table whose source_datarepo_row_ids are examined.
        source_table: Name of the table the row IDs are expected to exist in.

    Returns:
        A (status, message) tuple where status is "Success", "Failure" or "Skipped". Errors
        talking to TDR or BigQuery are logged and reported as "Failure" rather than raised.
    """

    # Retrieve dataset information
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    client = bigquery.Client()
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        table_list = [src_schema_table["name"] for src_schema_table in response["schema"]["tables"]]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.warning(f"Unable to retrieve dataset info for {dataset_id}: {e}")
        return "Failure", "Issue retrieving dataset info"
    
    # Check that tables to validate are both in the dataset
    if table not in table_list:
        return "Skipped", "Table not present in dataset"
    if source_table not in table_list:
        return "Skipped", "Source table not present in dataset"
    
    # Check for bad source_datarepo_row_ids linkage
    # The prefix is matched with STARTS_WITH and includes the ':' separator. A LIKE on the bare
    # table name would also pick up other tables sharing that prefix (and treats '_' as a
    # wildcard), and those IDs would then be wrongly reported as dangling.
    row_id_prefix = f"{source_table}:"
    validation_query = f"""
        WITH datarepo_row_id_list 
        AS
        (
          SELECT DISTINCT REPLACE(src_datarepo_row_id, '{row_id_prefix}', '') AS datarepo_row_id
          FROM `{bq_project}.{bq_dataset}.{table}` t1 
          CROSS JOIN UNNEST(source_datarepo_row_ids) AS src_datarepo_row_id
          WHERE STARTS_WITH(src_datarepo_row_id, '{row_id_prefix}')
        )
        SELECT COUNT(*) AS dangling_row_id_count
        FROM datarepo_row_id_list
        WHERE datarepo_row_id NOT IN (SELECT datarepo_row_id FROM `{bq_project}.{bq_dataset}.{source_table}`)"""
    try:
        df = client.query(validation_query).result().to_dataframe()
        # An empty result set is treated the same as a zero count
        if not df.empty and df["dangling_row_id_count"].values[0] > 0:
            return "Failure", "Dangling source_datarepo_row_ids detected"
        return "Success", "No dangling source_datarepo_row_ids detected"
    except Exception as e:
        logging.warning(f"Row ID validation query failed for {dataset_id} ({table} -> {source_table}): {e}")
        return "Failure", "BigQuery Error" 

#############################################
## Input Parameters
#############################################

# Input the relationships to examine:
# Each entry pairs a table with one source table that its source_datarepo_row_ids may point at.
# Keep entries unique: a repeated pair is validated twice and produces duplicate result rows.
relationship_list = [
    #["table", "source_table"]
    ["anvil_donor", "subject"],
    ["anvil_donor", "participant"],
    ["anvil_donor", "anvil_biosample"],
    ["anvil_biosample", "sample"],
    ["anvil_biosample", "subject"],
    ["anvil_biosample", "participant"],
    ["anvil_biosample", "anvil_sequencingactivity"],
    ["anvil_biosample", "anvil_activity"],
    ["anvil_diagnosis", "subject"],
    ["anvil_sequencingactivity", "sequencing"],
    ["anvil_file", "file_inventory"],
    ["anvil_dataset", "workspace_attributes"],
    ["anvil_project", "workspace_attributes"],
    ["anvil_activity", "sample"],
    ["anvil_activity", "file_inventory"],
    ["anvil_activity", "participant"],
    ["anvil_activity", "qc_result_sample"]
]

# Input the datasets to examine:
# Each entry is a TDR dataset UUID; every dataset listed here is validated against every
# [table, source_table] pair in relationship_list by the execution loop below.
dataset_id_list = [
    'cc19d19e-6f7e-41b8-87a6-77f41d53e650',
    '16031a34-f1ba-4bde-af43-1822f1516944',
    '6765ce2d-ebc8-4367-8855-c0f8e62cb355',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    'cb8ebcd0-bb5e-4a6d-bfef-5c651a1a9f6e',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '4999a410-990e-484b-b4f3-d636f894a741',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    '65793118-3c88-4185-9172-2354850e6056',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    '8de6dae2-55ff-4287-9b75-5b2a950c1f44',
    'd48db47e-acba-4377-b031-f6dfc21f3658',
    '3fd2204c-8654-4af7-832f-c186447262e0',
    'ae34e63e-13af-48b8-8b72-8137289091b3',
    'd3ed2595-b8be-40c8-b7b6-10a4997b9d2e',
    '575dc7da-58ed-407d-9e88-7b586f28bf65',
    '61803dc8-f649-43e5-ab15-d351f2cef629',
    'abe58d43-e1c7-4953-aa41-4d3b6f6cca44',
    'fa278604-7d85-4491-a30d-15c7821f8b00',
    'b5c0bf91-9d20-41a2-9dd2-87d0ef0310f9',
    '97c636f9-0983-481f-8ff9-7b5b3ee6b10e',
    '32bc49c6-7583-4613-a72f-5edb12b2a808',
    '3eb8ea77-4605-4bb7-90f9-671953abe4a2',
    '2b08cb76-061d-44c6-a00f-b43a5421df5e',
    'ab7e390a-adc5-4f9e-b317-a216a2904c93',
    'd1e6d0e4-d49e-4a16-93c6-7956b2c03414',
    '713f8676-8034-4827-bccc-cd6d95b1a4c4',
    'b00883d8-9251-435d-aefc-8a703d96d2fb',
    'eefbea02-0d65-441e-b455-35aa21d25ba3',
    'ff7e3be2-c0ac-4d97-85da-6229bf7585ac',
    '7ac92a42-e112-49c3-a8f5-8ad2c7ef5578',
    '0701aae2-8661-4eec-84e0-7c8be1c89a18',
    '7efb1905-34b4-4f1c-a8a6-8e64b3640a68',
    '11a2b088-8c1c-47d2-9c1e-455d457d2f05',
    '74608bd9-39e4-4f48-9b7c-1cd9d3c599c9',
    '7baf8e8c-de11-452d-b2e1-aad7c08cc18e',
    '23b0219d-0820-4017-b942-bda8578e90e2',
    'a5f631ea-2b4b-43f2-9ea0-e31f2b11fa27',
    '8523489a-f57c-4993-81e4-1ed86a5c092d',
    '395da421-e6e8-4a26-ac93-eb7050a7cb1f',
    '9cb5ce25-38e8-4628-9ddf-d6aedf5efe0f',
    '615f6246-1c39-4e44-a9d4-c7133a2ae62d',
    'a647528d-925e-4c02-8825-ff54720c6ee4',
    '2c6f63b2-439e-499f-b687-b3fdd88a492e',
    '68a916af-2e0c-41bd-8535-c7eacbc2d1b7',
    '0e7f31a0-c712-4ebf-ab3a-64c37f43e52a',
    'c8b1d323-f352-482e-bf17-82075c23dcee',
    'd30f51c7-d642-4e7d-a168-967b9520a80a',
    '8d89608c-0d61-4d71-a2e3-9fbc6cda69bf',
    '71219f56-551f-4ad4-9a38-cc4aaf8a1e9a',
    '48dd6010-77dc-465b-a27c-695e29b57a5e',
    '21384132-1697-4e9b-b863-a6492d13285d',
    '582f5f8d-b96f-490e-b417-ba824baeb06c',
    '7e825ee6-7c03-43cc-b0a4-0d9203a30bd9',
    '2843292e-e494-4642-90e0-57e5c153f12c',
    '4ecbb7c8-0246-47f8-9654-4caca1d52565',
    '7e3ea1bd-95ba-4cad-90c8-3eec95be9cc8',
    'bbba696b-d023-4bb1-a213-c8bee31e8bae',
    '470eee0f-2053-4d9b-9f5e-ca9661a6cc16',
    '5c1dc76d-b703-445c-9b38-cc2d00b9ab16',
    '608d793e-a78b-4872-a50c-21a9eaa60ec3',
    '74ede771-6781-4980-bfb9-5d853b7cdd6f',
    '6c47e282-5d5e-445c-b6bd-c0024946fbe0',
    'ff8ffbcf-c932-48c7-8d5e-d995d5680e21',
    '9d74b4f0-b2d4-46aa-867a-52fb6102bfdf',
    'e34f15f7-c225-4314-a638-90504bb0aa0d',
    'c1d222ab-bc0e-4e13-8379-0ee5be9e140e',
    '582187a5-ad63-4759-9162-55fa6337eb07',
    'd1e8d19a-970d-4ede-b5bc-9cab7237adec',
    'af867604-d801-41cc-9949-017eb30a0cbf',
    '722e332c-fb1a-45fe-80c7-cc670f025b7f',
    '9f152896-ebf1-4756-b678-bdf739a92256',
    '478aa270-fbd4-4a45-8f63-221b4066168e',
    '0b06619d-39d9-4437-8c42-2e415faa634c',
    'e9c7ad29-2213-4648-9164-33a07bd42cdb',
    '1d140c76-a06b-42a0-bae8-b9e169ebe394',
    '9f7dbe05-96b5-4b2f-9f3a-34b552e3dd21',
    '9e3fb02d-dcf6-486f-a42d-89446a852057',
    '15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
    '3a3100bb-369e-47c1-a77c-2cacb7cf020d',
    'e642bca0-52fb-4ab3-ab3a-acaab83deda7',
    '2c11b505-17c8-402e-8422-0239accb449d',
    'a6d7e030-e6c8-4c62-8cb5-165ef54987c4',
    'e25a8172-1e34-442c-a45d-583027a2d734',
    'c911503c-f010-4c17-ac57-1d82e954bdc7',
    '128dce74-fa37-4f2f-8a80-d542edd81a11',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'f461fca1-80b2-4980-83a8-e165d49acc18',
    '37f0f1f9-83fb-49a1-9941-093c068c32d0',
    'cfb3dad7-c6d9-47c0-81b0-2133d75f5c0d',
    'c5c0893f-b254-4038-8d08-b28ef5a26b5d',
    'bbcf8529-1a04-43fc-b6cf-cb161028159d',
    '631deea0-2821-4d14-ad02-dc0ce4864924',
    'eede320a-ed63-41d8-960d-5405a26a194f',
    '36dccf81-6932-43ae-9864-53379832d878',
    '01eaf423-8cab-491a-b82e-6915dbc73594',
    '0481a135-9db1-424f-9065-a83ebd7ec995',
    'b60876c5-d825-4303-befb-ffff55b92aba',
    '64f2dbe1-6f58-493d-ab6b-c93568d828f4',
    'a9626803-72c2-4e23-968c-a090e3f22c5e',
    '902596ce-714e-49b3-8271-f3dfece52309',
    'f5f29e4b-68f7-443c-b290-0827d4167fd5',
    '6f49717f-8f57-42d0-8548-316ecc292415',
    '7e693091-8ae4-4c40-8e66-c3b39f01b90e',
    '5137255f-0c58-4ac7-9266-bda8ab0247c2',
    '278a26cb-a710-4fff-928e-fc2e7084a75a',
    '9fc492f3-8d13-47ae-93e9-812c0224f1aa',
    '822d381e-cea0-45bb-8fa0-1b7194b4b64b',
    '44161b51-953d-4f6b-9448-5cba4a44a9d8',
    '0b90b2ea-8ca3-406a-9f69-95eddf7699ef',
    '8cc59f51-b0df-4a5d-a3c5-83ee526ff1af',
    '85646f4a-e424-4363-8033-1e7522e8f175',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    '583023a1-aa12-40e2-a964-8ad50ad400ba',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '488a38ee-f996-482d-a562-a4474f5594de',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    '279e5670-8a47-4992-bb10-14e6c719db97',
    '525a9535-74a3-4757-9507-52a684cf5647',
    'd306000b-88c1-4220-8d7e-933c0118a983',
    'ab76b5ca-e464-4063-b949-853f61036370',
    '672b617f-936e-440a-a735-80f94798aed1',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
]

#############################################
## Execution
#############################################

# Validate every (dataset, relationship) combination and collect one result row per check
result_columns = ["dataset_id", "table", "source_table", "validation_status", "message"]
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Validating dataset_id: {dataset_id}")
    for relationship in relationship_list:
        table, source_table = relationship[0], relationship[1]
        status, msg = validate_source_datarepo_row_ids(dataset_id, table, source_table)
        results.append([dataset_id, table, source_table, status, msg])

# Show every check that was run
print("Full results:")
results_df = pd.DataFrame(results, columns = result_columns)
display(results_df)

# Collapse the failures into one row per dataset, listing the set of tables that failed
is_failure = results_df["validation_status"].str.contains("Failure")
failures_df = results_df[is_failure]
failed_tables_by_dataset = failures_df.groupby('dataset_id')['table'].apply(set)
failures_agg_df = failed_tables_by_dataset.reset_index()
print("Aggregated results:")
display(failures_agg_df)


# Examine and validate file extensions

## Examine file extensions in AnVIL data

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
dataset_id_list = [
    'cefc1a79-446c-40d2-b140-ba8d8b1c0712',
    '4e699ead-bbb5-460d-9b32-2b1b322c601b',
    '2355554e-8951-4b41-bcd8-32e18cddb7c9',
    'bafbf771-1cd2-44fc-9b38-5a4bbead8ab2',
    'a36eeaf7-d6dd-4887-bdbd-e435a07ba156',
    '8de6dae2-55ff-4287-9b75-5b2a950c1f44',
    'ce6692aa-0f97-48fa-8628-b8fa3eab4726',
    '31433635-91d4-431d-8d26-bc54e84c8e8c',
    '0b06619d-39d9-4437-8c42-2e415faa634c',
    '12ffb586-5f6a-4f0a-a353-d2f34599f4cc',
    'e642bca0-52fb-4ab3-ab3a-acaab83deda7',
    'b7fb531e-25a4-427c-9679-b7bdc3d03535',
    '3615e063-f24b-47f7-87cb-430e8aca8d0c',
    'a3ea4f97-6657-4d3c-9be6-96f097f5c952',
    '9f4ac69c-0919-4ac1-98a8-976ed79ace03',
    '96461004-f4b3-4f82-a842-293b3ec46a60',
    '841b7883-9447-4ea0-ae4a-84ea0240d919',
    'd0fc3d6a-c3f4-4533-8a23-817a4e27f9be',
    '2cace5dc-f660-45d4-b689-c4c89e77697c',
    'd2272f2d-c606-4027-b8ea-0bdd6d9d6535',
    'b12fb9be-2ce0-4bfd-8503-732fabba06ab',
    '9f9fc99a-b867-49a9-a3dc-8a39efbd5fa2',
    'ce58654d-b7d3-466b-99ba-b203d527a543',
    '179eb85e-2557-4677-9cba-d763310f3df9',
    'cba804c9-0bdd-4219-a53e-98c8db6334a0',
    'd239dd7b-8d10-4960-aa91-8f8ede641e25',
    '5c6a1c4f-ccd3-48a8-ac00-e18e5ecaa0bb',
    '19e2c8ab-853a-4204-86c3-f591125fbf63',
    '7cf0d3d0-f79b-4bfe-bfc8-e4e6c33dd4c3',
    '3a72e4b8-afb4-4299-98ec-a9ba9606be06',
    'f3c89298-0dd2-40da-8627-3baea553b34a',
    '9a32e23e-840d-4ba3-8cd9-392f48b8e9d2',
    'c5d967fd-09ce-4b02-97dd-ac3abf6f79fa',
    '5069fc2c-b957-4130-adca-6eabae943867',
    '173e56f7-b813-4c41-89ff-09a824e1407f',
    '80312f74-bd56-4938-96ba-e9bed95d1f3b',
    '017445d7-d56e-4e2e-b480-b4879b51e944',
    '13b2076a-cfe1-49ec-ac61-bad1af9a52ea',
    '175dd803-02c7-4823-81d5-9e0621652ace',
    '15492baa-05ed-47bc-b50c-e587679ae51a',
    '33705ce9-b2b3-4edc-9b47-f54283e193cf',
    '92486440-3a46-44dd-b853-b300ef75b31e',
    'dd2cb8fc-42a6-482f-898e-ef6125feccb8',
    '4e99b8e1-40b9-4fb2-90a0-d85e926ef31e',
    '128332b6-5060-4ec4-b6a6-f53b54a810be',
    '06f05f58-3c83-4f5c-bddd-bed7d2d1d147',
    '51e9935f-ec18-4832-801a-6d9186537572',
    'd6291444-8c3c-470c-b28c-7cf1d5c7aad8',
    '41cb9f29-4ba6-4690-821c-cb085e6b0f2f',
    'e68d1d39-99df-4cd7-8053-1b298f03eabb',
    'b252e3ac-4a8c-48e0-9999-5ee0c9a5842d',
    '7ea006d9-1e19-4678-b2e6-d4a1ea327f74',
    '34fd3b22-ac73-47d2-8849-5877158ec072',
    'a08dc7a6-f8ce-4205-95d2-83f614c2c32f',
    '577f36fe-8154-4c82-ac87-b2a64cb68f35',
    '7ce3270e-b2f2-47f4-a288-639751b2f87f',
    '36bdd59f-4f5b-43cd-8d34-a21ef87bbf30',
    '41d12dc1-8718-4439-b409-26cc23573107',
    'c4c49fcd-0c20-4cff-841a-cb58f5689c5b',
    '6b40557c-ddc3-4e7e-8a45-1761e7fcb8b5',
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
    '9ee2a552-89f8-4a48-9c94-9fa26ebb7483',
    '425412ba-894a-4824-acb8-bf18fe4576e0',
    'f22bd762-5c45-453e-bf22-b174514abb84',
    '0ee62643-b064-42f8-9b09-5d10eacd70a3',
    '1a7f6728-5116-4f24-897a-59a7f322cfd2',
    'c37b388c-7107-43d6-bee6-4e82b40ed271',
    'bf6f1d78-6a0d-4afb-aea6-17a3c34340db',
    'a3becdde-018b-46f0-adea-d587076eef4a',
    'a9ad3a05-24fb-4e59-85b0-ee09e55a4492',
    '719f7581-21db-4aec-8c46-4a5811832710',
    '3725b660-1106-4173-9c4b-0a15926becf5',
    '318a75f4-ac50-4944-81b0-70a1323e7497',
    '75fb0984-2124-444f-881b-30a1a6f8b8f7',
    '15be288e-53e1-41cb-8d20-8ea87efb9258',
    '700303c2-fcef-48a5-9900-096bf34e2d83',
    '38fd20ce-affd-4791-9810-7f5a7fe876d0',
    '8b8185d3-ba5c-4832-af23-3ff8ca6ed016',
    '140797da-dc94-4fc2-8b0b-f2e1dec7bd43',
    'ec97fa0f-e174-40fe-a6b8-ee240bdf4318',
    '5488d7c1-5195-4ebc-b0f0-31033fa06dc9',
    '56f9888f-e623-4a1a-b2b4-46378a6cd6fe',
    'b2e7f15b-65d5-4812-abfd-b2dbc6d18850',
    '69f8d7c2-2e14-48e6-b838-7881016313fb',
    '809fa952-3178-46b7-bb82-8a476ef32e67',
    'dfb14a1f-38b2-4668-b98f-59b5b5b53ca3',
    'f553b765-1c9c-464c-a8fa-07700a1691c5',
    'b108dfd0-711d-4bc1-aab5-1b312226c8ad',
    '38eca26c-d79e-4447-99d3-1889d20ade21',
    'd5a0e24d-689a-4854-92c7-9a39f980b523',
    'ecd0606f-4fa7-4e57-b6e8-eea377e65d5b',
    '0c6bc810-5ae0-4926-a56b-2bc2fe7dbe6d',
    '9dc31133-c882-4f39-903a-a25f316bb560',
    '23a0ede0-4f97-46af-9f04-bd2805050980',
    '7593c1c2-3680-4bf5-8a65-dce5f96a3b59',
    '02661394-2886-4ef7-aff1-d53225c82025',
    'c1644d4e-06e2-4fa8-95f1-5c1da5831257',
    '267cf516-dd33-4640-a71a-78bd8f5db9d8',
    'df06ff22-6a2d-4934-aac9-c8368efbea1a',
    'dcdefb14-f6de-4c46-ac7e-842b273416bf',
    '28208cc6-50bf-4864-9a48-981632066640',
    'ccfe264d-a35e-44f8-9b2b-241a0f8327cc',
    'f177843d-47fb-46ae-83be-73c92ee85081',
    'd049d487-1a69-4358-8dad-0e6fa6c06fdd',
    'da29226b-e856-4014-8c8b-c4268d0df2cc',
    '714dcdbd-8d17-40b0-8246-0e941af8175d',
    '53ce7d12-facc-4412-a710-f535efb209a2',
    'b9842819-5fd6-40c5-9668-aae1ea44a308',
    '7c056125-3ed8-459c-b73e-edfa3f80cc27',
    'ed1215f1-787c-40f5-9d77-4b5bc2dfbb84',
    '7feeb2b1-1926-4968-b6dc-e0a1e4cf8d4a',
    'dbf5d87b-4cab-44cd-a792-1d0218aad973',
    '85dbde76-c130-40b2-8a8a-ba815ba499da',
    '1c2fe11d-b020-4c54-8c71-1ea91623d626',
    '84133066-68cd-41fa-819b-d74a3ac85862',
    '54c6fa73-9b84-4a3b-9e97-e4e43165c48b',
    '90ba1853-f845-4502-ba36-b75b9e571bc5',
    'f3c88c3c-8e1b-4af9-9467-0621404e314c',
    '332bb145-6ef1-40ef-932c-aec5bb6210d9',
    '44f83f20-d618-40b5-b2cb-3676b8fe3ad7',
    '3c2c39a9-4cc2-4f7c-89e0-054a871e2c4e',
    '2ebb722f-a3df-4ea4-b72a-813e3db0bab5',
    '1817528a-4f88-4ed6-8965-9eae0220ab27',
    'e03eb011-05f9-4491-b779-0cc2aefabff1',
    'd4bb7169-5a7d-4090-ba62-12ea799c3ade',
    '3f172982-060d-4339-a09b-6994c2c9eb16',
    '51789659-5233-4ee7-8bca-dedebfc87773',
    'a5fe75bb-d28c-42fb-aaf8-92fa37b266d2',
    '6545d602-e5b4-4dd1-8f6a-64e0a1952ddc',
    'e5c79b74-20d5-4b6f-8085-0bc788eed2ea',
    'f492567d-6db8-45c8-b44e-6b5def26c812',
    '12e54f96-00d9-4a38-921e-e0d42610b2c1',
    '845b131e-7c05-4397-ad40-23dea8e9b399',
    'bd492b71-b20e-4056-b8ae-ad8c94cfbc02',
    '9ecc231f-e3d3-4417-a98a-c4db4c638161',
    '3fb2d04a-d18b-4bdc-9372-99b992f2ae42',
    'e922a496-e686-4fa1-911d-2159ceb0f09f',
    '8fbfea50-6a71-4b19-98e9-f95e3a8594c7',
    'd911e57a-ebb8-4be8-876b-d8e5790ddce3',
    '6c9423a2-3ea7-4c3c-9b12-0cc993bc095f',
    '52e015b5-22b7-4a96-9f0a-ea3afccbfcbc',
    '325f3ee8-2adb-4092-bd78-1b5ea5b0d1d6',
    'ae50ef98-ef3d-4427-b094-83b2d90787a0',
    'e6771964-50e9-482f-9d23-18c22cd89ab8',
    '3fbacc64-4c53-4770-8cdf-a616c10ec5c7',
    'a3ae33bb-8b3a-47e5-a2d1-a49c954776b3',
    '0b0a52bb-a1a2-4638-9259-4447761c2da4',
    '0eb42259-7b44-450f-a9d7-500b2ea7179c',
    'e16adabb-88e0-4739-983a-98ac5c181842',
    '71f94dff-fbe8-4881-af1f-4987b67d5181',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '0e65b131-fd14-4fce-908b-c5b89a71a9c1',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
    'd00353de-f6f9-42d9-8a8f-f88b3d880dbf',
    '3be57453-9325-4c2e-b73a-832139b61778',
    'fcc60ac9-0d20-4a7c-97e4-e3c8d3aa8f76',
    'dc5f85d8-333b-4b68-b160-ad9856233887',
    '655e6a61-5400-4d8a-95bc-1506e026b289',
    '1f2d14d4-1bd8-46fc-9d35-1a415e5f326a',
    '64fd39fc-b32e-4b0a-8f83-4bf11b197462',
    'a77a2c65-38fe-4bf7-9ea6-0a2dc65eb21f',
    '25248cd8-2e98-4a83-9ccf-af7214fa71d6',
    '158ebecd-4596-4541-b832-a137232b7036',
    '1ccb95c3-1901-428e-b7bb-34495f41f4d2',
    '02ff1051-cd1d-4bbb-a005-21384cbff846',
    '0144b0d3-a809-46df-8c67-7ce42bdd579a',
    '35a1009d-93a2-49b1-a801-fe84d6b7a2f5',
    '50132478-c9fb-4dc5-86cd-d5dfab909393',
    '35064fc1-6c52-4005-8e99-cb0d6afd3f8c',
    '5cf859f6-990c-4b04-8609-35d5c57920f0',
    '62cfdce6-2d4d-415c-a11e-5ab60131c668',
    '2d07dd45-a263-440d-a339-9ccbab93aba8',
    'f1513955-0264-4733-bd25-3f752c61a323',
    '93e712f2-3e54-466e-aa53-57eb69c43bc0',
    '296f653a-91a8-4139-9bab-e6ae13afe99c',
    '633dc1aa-084d-43bd-9b17-bc6e57f81d48',
    '9320b3b5-3944-4bd2-913a-23b72bccd86c',
    '86ab4d3b-86ce-422b-ae6f-1ec6968a874d',
    '4124010f-7308-4831-80d7-ea14343249ab',
    '3037caeb-fa7a-4924-b399-7e4c7173b3b8',
    '146b72bc-1dcc-4e3b-bcda-d3dd25418012',
    '1d575e14-c3b1-4ead-a63b-a21c08c6a14d',
    'c5b1e333-7203-41ce-b8f7-3ef3a3bd721f',
    'bf519ea2-afe1-486a-9954-7362f10b6b60',
    '254ffffc-2bd6-4b2e-905b-a8c54c348cd0',
    'bb65d291-a673-4e4d-8a37-ab1f7401a902',
    '3a9604d7-456a-453d-a46b-40408624a07e',
    '2cbe079d-e7ab-47d8-836e-454a71440297',
    '84fad495-2756-472f-ad20-f91de6f67baf',
    '28e73469-12d4-493b-bf6f-83359c1f69c5',
    'db266afc-2f75-4b03-a3b8-c69e0ce6f713',
    'dd6866e4-8949-45bd-8910-8ce64f79e3c7',
    '12bbfa4c-c30a-4cf6-b79f-45354f842964',
    '84ac0d05-4be5-43e9-973e-ef999144d802',
    'ff8b1212-858a-4048-8f63-9464c922591a',
    'c814d754-cdc5-4b0d-8671-a39e85b2c473',
    '797b2563-5d56-4f5c-bdaf-3bfd11e8f5b3',
    '85287d84-fefe-40df-ad40-5b135ee0c07f',
    '7eeede5a-c86f-4577-9f3c-65ab618a6dee',
    'a52c04ee-cfef-46bb-9b40-6a9b292e1a7b',
    '0194eea9-d779-4957-8521-11717a378e66',
    '0d82658c-44b3-4cea-a388-3353a96a31ef',
    '77dca0d5-4d22-4415-8858-075590d25cb5',
    '1b05159b-6277-4345-9d59-f7bba5ea1d56',
    '92299ff4-c0d0-4e94-b374-75d0038cbd68',
    '60f96582-79ad-4461-9f9a-53c1bc3d17b6',
    'cc107de7-d623-464a-a875-c8b7ae5fb09d',
    'b5d7c34a-c383-4fc7-aa4d-b6dc941cd41a',
    '85baa8f8-619c-4165-9d3e-53220f645814',
    '3f278de3-f201-4344-9639-d35cd7a62adb',
    'c423b18d-12f2-43e4-97f9-993e2943270e',
    '416b8daa-9537-46db-ae7b-3f5ff5f01dc3',
    '61940344-e6c1-484e-ba10-131f43a9b13a',
    'ceee2791-0fdf-45fc-a4e8-8077916771aa',
    '5205f817-7de0-48b4-89fc-6398cf13bff7',
    '1d2f5472-ab6a-4a9b-ba53-520858cf79db',
    'f757278a-3c74-4690-bf89-5149d21ff3af',
    '5a103ab3-29c3-4d07-a0f6-4999c256cf26',
    '2a263db0-8c33-4171-840f-54bf4755a4b9',
    '9828f3fe-f676-4bf1-b600-5effa24ea9c8',
    '28849dc9-a97f-469b-b2ac-a8ff97693f02',
    '29cd0578-fb47-495a-8f48-b37325eed81a',
    'ba503d2e-48af-48bb-910a-be41790d921c',
    '472f01ad-7bc3-4fe5-9771-2695930dbc95',
    '956cd931-7077-4a08-9c75-ab8b4e5d1eb8',
    '31a42df0-29f6-4d4e-ae5c-2e13abc355f2',
    '13364604-ed08-4a61-89cd-65eb372ac8c3',
    'b724164c-712c-4615-97b7-529a108a753a',
    '22199347-9454-41e5-8912-eb38edd33a25',
    '60cadee3-9e63-4897-ac81-4fb283033648',
    'dcd4112f-09d3-43ed-8441-df9bf4c9ddc8',
    'da02c3f1-371b-4afe-9b5e-b8c584fd5907',
    'f6565f2f-4478-45ad-8c11-04dd242fc6a9',
    '275ea204-4612-4d3c-ac0d-f110f61d62ad',
    '72f73fc5-6a3a-43a0-8cce-09f4726b736c',
    'be72f1e3-b5f5-43f4-80db-6d7de93a654e',
    'b6bf4699-6f61-4c6a-9d42-ad055a0de008',
    '8abf299c-cd4e-4ce0-b5cf-4f9abe8cc891',
    '0b6eb077-2eca-4fe6-b012-26fab725b907',
    '5c659e81-e687-4710-a4fd-000ca593155d',
    '42965913-4223-484a-9b3d-abc0002d277d',
    '72e639d1-b8c5-45fd-9acd-a8e5e2b7fa0d',
    'd6823ccd-7247-4efc-8841-f53f456351ed',
    '1048a860-d5ff-4f61-95e5-851e1266d4c1',
    '8681cdcf-b775-4b56-aace-3f3e448261ef',
    'f0db3b27-c952-477d-bc33-9b96a250e168',
    '9e1a6a7d-b45e-4fd2-a1ff-df131da4c713',
    'd596ee91-481c-4eb5-9a8a-88c1e10ba9b6',
    '24470eb6-97c2-4cd4-b484-87a7d634c5b3',
    '3a781e70-cf6e-41c2-8d68-2326f16986e7',
    '048afc84-cdd2-4b39-8ea5-7351f4699761',
    '3fdcdafd-5328-418b-85f8-47b0006de468',
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    'ec6f49a2-176c-4564-82c5-e751baab46aa',
    '68ea655f-b4a3-43e2-95e4-f158ca2d67dd',
    '0447c960-bbfe-4e42-a95b-dd3d1d9a368e',
    'cc19d19e-6f7e-41b8-87a6-77f41d53e650',
    '16031a34-f1ba-4bde-af43-1822f1516944',
    '483d3454-54da-4243-bbeb-98cbf1d088d0',
    '6765ce2d-ebc8-4367-8855-c0f8e62cb355',
    '1b4a324a-5621-4399-85d2-f91aa03418b5',
    '09642596-d33a-4261-8bf7-eb1dbb37d572',
    '75119ed5-b8aa-4f45-bdef-e3c673bbe44c',
    '04a874df-c57b-40fc-9139-bc3a05129115',
    '2b8ad26a-e66e-4b03-a65a-5b504cecacfd',
    'a7226f10-bdba-4284-97b3-0738a5912770',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    '8da05494-fe7a-4af5-b257-bada143ee426',
    '8b098ab4-df02-4619-8ded-657e496695c1',
    'd48adc59-8934-41bb-9720-63e71f1933be',
    '8e88cabc-e713-44ed-a5d2-41935c3b4eb5',
    'be8cfc23-cd19-46fb-92e1-a77ac380d7aa',
    'f9224ea2-dd31-421d-80d4-f35082ef8d68',
    'e2a398ff-18c3-4258-9d75-89adb2923e88',
    'cb8ebcd0-bb5e-4a6d-bfef-5c651a1a9f6e',
    '6238f8f7-5efb-4023-8d85-ef7db9b4dad7',
    '32c09444-3d4a-44d5-af6b-07eef92189db',
    'd7686f98-05a4-45c9-af2e-3ebc524a5b2d',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '8ccefc59-38a5-476f-b7d3-3f98315a97f0',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '6e67e1e1-5c39-43da-960f-48385789c4e1',
    '92382848-f5e9-426c-b7dc-f2841ae97018',
    '4999a410-990e-484b-b4f3-d636f894a741',
    '1f534eb4-701f-4182-9895-64c5e5b52d82',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    'feca4815-b44b-4b2b-8d77-75edd62ba5a6',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '7427b2eb-a84f-413c-bfb0-7d2e36b0628f',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '28c3df75-0b08-4d5c-9feb-6e2e918572ea',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    'f69c21e9-cb5f-4e72-acfe-c54b672a9f3b',
    '2ef4530a-cc36-4f32-9a1a-63a555346587',
    'e917c83d-c482-442d-81ce-869de7d20903',
    '10774229-1487-4188-b2c3-1fabcf85492a',
    '65793118-3c88-4185-9172-2354850e6056',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    '46536136-08e4-4521-8e6c-67f023de020d',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    'b8e7fe18-9c3d-4cc0-bbc7-85b27197fc8f',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    'b60b4737-c646-4299-85a0-520890e830b7',
    '280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
    'd48db47e-acba-4377-b031-f6dfc21f3658',
    '3fd2204c-8654-4af7-832f-c186447262e0',
    'ae34e63e-13af-48b8-8b72-8137289091b3',
    'd3ed2595-b8be-40c8-b7b6-10a4997b9d2e',
    '575dc7da-58ed-407d-9e88-7b586f28bf65',
    '20ddfcd5-d456-431b-9f05-781e05d873d6',
    '15d41c35-943c-474b-afa6-e1c6d6e4be2b',
    '61803dc8-f649-43e5-ab15-d351f2cef629',
    'abe58d43-e1c7-4953-aa41-4d3b6f6cca44',
    '3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
    'fa278604-7d85-4491-a30d-15c7821f8b00',
    'dd6c6688-b73a-464c-86d9-3369fdf98268',
    'b5c0bf91-9d20-41a2-9dd2-87d0ef0310f9',
    '97c636f9-0983-481f-8ff9-7b5b3ee6b10e',
    '15b153f5-ed02-4216-8f96-99743b8b4fc3',
    '747858c0-d139-4f52-9f0e-a618b880d6d6',
    '32bc49c6-7583-4613-a72f-5edb12b2a808',
    '3eb8ea77-4605-4bb7-90f9-671953abe4a2',
    '2b08cb76-061d-44c6-a00f-b43a5421df5e',
    'ab7e390a-adc5-4f9e-b317-a216a2904c93',
    'd1e6d0e4-d49e-4a16-93c6-7956b2c03414',
    '713f8676-8034-4827-bccc-cd6d95b1a4c4',
    'b00883d8-9251-435d-aefc-8a703d96d2fb',
    'eefbea02-0d65-441e-b455-35aa21d25ba3',
    'ff7e3be2-c0ac-4d97-85da-6229bf7585ac',
    '7ac92a42-e112-49c3-a8f5-8ad2c7ef5578',
    '0701aae2-8661-4eec-84e0-7c8be1c89a18',
    '7efb1905-34b4-4f1c-a8a6-8e64b3640a68',
    '11a2b088-8c1c-47d2-9c1e-455d457d2f05',
    '74608bd9-39e4-4f48-9b7c-1cd9d3c599c9',
    '7baf8e8c-de11-452d-b2e1-aad7c08cc18e',
    '23b0219d-0820-4017-b942-bda8578e90e2',
    'd7bcfc5d-e258-4bd6-a413-bb7a118e6bff',
    'a5f631ea-2b4b-43f2-9ea0-e31f2b11fa27',
    '8523489a-f57c-4993-81e4-1ed86a5c092d',
    '395da421-e6e8-4a26-ac93-eb7050a7cb1f',
    '9cb5ce25-38e8-4628-9ddf-d6aedf5efe0f',
    '615f6246-1c39-4e44-a9d4-c7133a2ae62d',
    'a647528d-925e-4c02-8825-ff54720c6ee4',
    '2c6f63b2-439e-499f-b687-b3fdd88a492e',
    '68a916af-2e0c-41bd-8535-c7eacbc2d1b7',
    '0e7f31a0-c712-4ebf-ab3a-64c37f43e52a',
    'c8b1d323-f352-482e-bf17-82075c23dcee',
    'd30f51c7-d642-4e7d-a168-967b9520a80a',
    '8d89608c-0d61-4d71-a2e3-9fbc6cda69bf',
    '71219f56-551f-4ad4-9a38-cc4aaf8a1e9a',
    '48dd6010-77dc-465b-a27c-695e29b57a5e',
    '3376a8b6-7ef6-4191-97ab-a547da0d330d',
    '21384132-1697-4e9b-b863-a6492d13285d',
    '582f5f8d-b96f-490e-b417-ba824baeb06c',
    '7e825ee6-7c03-43cc-b0a4-0d9203a30bd9',
    '2843292e-e494-4642-90e0-57e5c153f12c',
    '4ecbb7c8-0246-47f8-9654-4caca1d52565',
    '7e3ea1bd-95ba-4cad-90c8-3eec95be9cc8',
    'bbba696b-d023-4bb1-a213-c8bee31e8bae',
    '00bd45f9-beb2-4fb0-8680-bd30e392975a',
    'f85e467a-958f-4da5-a01b-8df883e69122',
    '470eee0f-2053-4d9b-9f5e-ca9661a6cc16',
    '5c1dc76d-b703-445c-9b38-cc2d00b9ab16',
    '0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
    '608d793e-a78b-4872-a50c-21a9eaa60ec3',
    '74ede771-6781-4980-bfb9-5d853b7cdd6f',
    '6c47e282-5d5e-445c-b6bd-c0024946fbe0',
    '6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
    'ff8ffbcf-c932-48c7-8d5e-d995d5680e21',
    '9d74b4f0-b2d4-46aa-867a-52fb6102bfdf',
    'e34f15f7-c225-4314-a638-90504bb0aa0d',
    'c1d222ab-bc0e-4e13-8379-0ee5be9e140e',
    '582187a5-ad63-4759-9162-55fa6337eb07',
    'd1e8d19a-970d-4ede-b5bc-9cab7237adec',
    'af867604-d801-41cc-9949-017eb30a0cbf',
    'c9dd3578-01db-4687-9807-4f71368941d1',
    '722e332c-fb1a-45fe-80c7-cc670f025b7f',
    '9f152896-ebf1-4756-b678-bdf739a92256',
    '478aa270-fbd4-4a45-8f63-221b4066168e',
    'e9c7ad29-2213-4648-9164-33a07bd42cdb',
    '1d140c76-a06b-42a0-bae8-b9e169ebe394',
    '5edcc3db-c676-412a-9506-600959bb81f2',
    '9f7dbe05-96b5-4b2f-9f3a-34b552e3dd21',
    'ccc524ab-d9ad-467c-a25b-9a14fb05e976',
    '9e3fb02d-dcf6-486f-a42d-89446a852057',
    '4b341ba9-49a5-43a2-9b7e-cc96beb59946',
    '15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
    '3a3100bb-369e-47c1-a77c-2cacb7cf020d',
    '2c11b505-17c8-402e-8422-0239accb449d',
    'a6d7e030-e6c8-4c62-8cb5-165ef54987c4',
    'e25a8172-1e34-442c-a45d-583027a2d734',
    '0c18589c-6432-4a6c-90ce-985a47a66f39',
    'c911503c-f010-4c17-ac57-1d82e954bdc7',
    '487016d8-ea02-4b20-a45f-7382139aa865',
    '677f0bdf-6c5c-462b-8294-3666f777bbc5',
    '34da5c11-bbe8-4e55-8d89-9ef8a1c66200',
    '9a4d9d5f-72aa-4d7d-90f1-6d1181ee984c',
    'e6b15b39-daba-431f-a918-e4e43e702c30',
    'bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8',
    '128dce74-fa37-4f2f-8a80-d542edd81a11',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'f461fca1-80b2-4980-83a8-e165d49acc18',
    '37f0f1f9-83fb-49a1-9941-093c068c32d0',
    'cfb3dad7-c6d9-47c0-81b0-2133d75f5c0d',
    'c5c0893f-b254-4038-8d08-b28ef5a26b5d',
    'bbcf8529-1a04-43fc-b6cf-cb161028159d',
    '06421648-dfcb-4460-b93b-c7d6804dddbb',
    'e0b28b59-1cb5-44f4-ab8f-badf5c74f69f',
    '631deea0-2821-4d14-ad02-dc0ce4864924',
    '95788aa7-c897-4ae8-9166-4b8fc1fc5342',
    'eede320a-ed63-41d8-960d-5405a26a194f',
    '36dccf81-6932-43ae-9864-53379832d878',
    '9102024d-58c0-4bb9-aa55-12c00d98b6cd',
    '01eaf423-8cab-491a-b82e-6915dbc73594',
    '0481a135-9db1-424f-9065-a83ebd7ec995',
    'b60876c5-d825-4303-befb-ffff55b92aba',
    '49022563-1be1-4e42-a11c-01743cd5c94d',
    '64f2dbe1-6f58-493d-ab6b-c93568d828f4',
    'a9626803-72c2-4e23-968c-a090e3f22c5e',
    '095728d6-4ea1-4909-8a74-a8f3fa7f86cb',
    '8309cd89-a912-462a-90ad-f13ae0d7aa6c',
    '902596ce-714e-49b3-8271-f3dfece52309',
    'f5f29e4b-68f7-443c-b290-0827d4167fd5',
    '07c3a7f4-1e59-4dcb-a244-2fd3d084e2b0',
    '6f49717f-8f57-42d0-8548-316ecc292415',
    '7e693091-8ae4-4c40-8e66-c3b39f01b90e',
    '544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
    '5137255f-0c58-4ac7-9266-bda8ab0247c2',
    '5243df74-712d-49a8-989b-528d15088e8f',
    '278a26cb-a710-4fff-928e-fc2e7084a75a',
    '58a1d168-8290-4c69-bf01-17ba3a084365',
    '9fc492f3-8d13-47ae-93e9-812c0224f1aa',
    '822d381e-cea0-45bb-8fa0-1b7194b4b64b',
    '2dc01a50-ea7b-4d9b-be57-1ffbdd98b27b',
    '5b6676dc-f46e-43a8-b87c-e431e369e53c',
    'eb35085f-0cbf-4829-a3ad-acaa53a250b5',
    '36fa2d20-622d-4cca-80b0-683672c94170',
    '44161b51-953d-4f6b-9448-5cba4a44a9d8',
    '0b90b2ea-8ca3-406a-9f69-95eddf7699ef',
    'bfb202bc-4078-4df0-82b9-9218dbc1f1a1',
    '8cc59f51-b0df-4a5d-a3c5-83ee526ff1af',
    'fda7c4b9-9f35-482b-9eff-be7f11058d94',
    '85646f4a-e424-4363-8033-1e7522e8f175',
    '8945794e-174a-49f9-a2d4-4242f9bf3833',
    '529343b4-698a-4b36-ac55-db8a6965ad3f',
    '0f949ee9-0986-42b2-af5d-0f4c8338c664',
    'cb1f06fa-b916-477d-8ab6-fb4b3f24efd3',
    '05253b3c-e8a3-4db4-8a6d-014eac7b3d94',
    '4807db90-b0f7-441d-b489-932f9b341f74',
    'c33b1f32-6021-4d1c-a4d5-fc3d501107f4',
    'aa314675-af62-41df-b5cb-3b22558e903b',
    '20741062-7d1d-44b7-bc33-39c9ad26e414',
    '69ce1be3-1815-43a4-bdd2-4696d9c8d09a',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '18716daf-4223-44a9-bba9-fc9baeef7d07',
    '475430c5-28cb-456d-9c5c-bdbfab9fafb2',
    'f0061cb3-688e-4ad4-aeb8-8614282292ec',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '3a89c170-2939-4c12-9940-f32d96fa9e55',
    '9a06c401-da3f-41b4-b38b-238796fcae09',
    'b32d88c8-31e3-4789-a75f-e52bf1272937',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    '5e0e8f9a-ce97-4b18-9540-3015c61e393c',
    '1c8ba244-1c7f-433a-825b-d2d34d018dcf',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '373ff2e8-0f63-4179-a55c-3fe0b85556aa',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '7577f264-8e84-440d-9346-7c4d5affda51',
    'febd8561-4769-4f3b-b7c0-ae7ff6ede2e9',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    'de1e7762-673f-4d44-8f45-7e693bb338b7',
    '239a484f-67c2-4ba3-a3d0-d6e4c2b27475',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    'c6f3bd64-ea67-488f-904f-f0bdf6320b5c',
    '166746e8-ce26-4fa1-a587-443ca9fc59a1',
    '6d18aafc-0240-499c-902e-a72a5b98ff0a',
    '263ab7c9-bd69-45dd-abb7-bbf35b9786ed',
    'e0c7877e-75d7-47d8-b5e9-5dd677d03353',
    '49a97523-0a7a-4d5a-ae20-496f86de2032',
    '583023a1-aa12-40e2-a964-8ad50ad400ba',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '17d3ffb4-e891-4ac6-a91a-fd52971c1115',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '488a38ee-f996-482d-a562-a4474f5594de',
    '680d748c-7c60-46e2-aea5-7fc557a916ea',
    '462d992a-7c13-45ac-a6da-1254fc3a9031',
    'dbb4df81-9115-45d1-b51d-875e0669edc4',
    '6fd0f009-3c34-4529-9a38-c59745545490',
    '74d1e549-5ae8-4410-9428-f8f2cc85fa80',
    '868f72af-99e8-406e-9f7e-14577e6c7157',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    '595b6755-e7ae-4e83-af2e-693c089aeec3',
    '4d01e12e-503e-4447-8e49-8c2b77ffb00d',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    '279e5670-8a47-4992-bb10-14e6c719db97',
    'd306000b-88c1-4220-8d7e-933c0118a983',
    '703c4bc2-81bf-435a-87fa-21dc9278bad6',
    'ab76b5ca-e464-4063-b949-853f61036370',
    '672b617f-936e-440a-a735-80f94798aed1',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
    'c9986260-0c1b-4fd3-8132-6fa7353046e6',
    'bda2bec8-a142-47ab-bfb4-83759ac2bddd',
    'e858d4f9-3385-4640-b0cb-4894e86d501c',
    '39fe0c8b-bd78-4565-9415-63eabc1d6d85',
    'da4e904f-0346-4cd3-a5c2-ba932511d98d',
    'c46c2220-da88-4f60-a0cf-eebfd0a8ff12',
    '629e31cb-dd7b-4345-abf2-fa23c6c65a09',
    'e9a57082-5a93-481a-bbd0-1acb03ac751a',
    '0faf149d-b316-4fbd-8605-a59354f0eacd',
    '1d23d3cc-5db6-4734-bfaa-507dd366d99b',
    'dd58f556-0049-49c3-9a51-d6470a2abddc',
    '53185d06-f2cc-4942-88c4-8534b559a9ff',
    '0132f320-830d-40d0-a4da-06a5d5f9e8d9',
    'bb7d6408-941a-4da6-8613-36498bc6d91b',
    'b8d11ca1-3db8-4efa-bf57-0305e004a26d',
    '9bd56ad6-080e-4d26-acca-83e4df8aa913',
    '00c11c7e-8530-4bfc-abd7-8c10f4c602d3',
    '8f6b9e20-9468-4f46-aa45-eeab9de88e53',
    '272dff18-acf3-4874-a55f-ba8fb6f80352',
    '70ac3659-06bb-4022-be55-af81d3e35b6f',
    '409b92cf-5c4d-4997-9736-ef2ea10d19e9',
    'a8636719-e26c-49b6-9a53-7d77f3d3c94b',
    'a3e81d5f-8dd6-43dd-9172-d80d212efa2d',
    'd40af129-c13f-45b2-92f0-d0e8fa5cc1c9',
    'ecd2d2f9-2b6f-4743-8d04-c9bb554a96cb',
    '9ee78822-7acd-4fab-9999-c58e9fe266ad',
]
use_raw_data = True

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")
file_format_count_dict = {}
file_format_source_dict = {}
file_format_details = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
dataset_list_len = min(len(datasets_list.items), len(dataset_id_list))
records_list = []
dataset_count = 0
for dataset_entry in datasets_list.items:
    if len(dataset_id_list) == 0 or dataset_entry.id in dataset_id_list:
        dataset_count += 1
        logging.info(f"Processing dataset {dataset_count} of {dataset_list_len}")
        if dataset_entry.default_profile_id == billing_profile:
            # Retrieve dataset details and pull source workspace(s)
            dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
            bq_project = dataset_details["access_information"]["big_query"]["project_id"]
            bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
            client = bigquery.Client()
            if use_raw_data:
                file_ext_query = """SELECT full_extension AS file_format, COUNT(*) AS file_count FROM `{project}.{schema}.file_inventory` GROUP BY full_extension""".format(project = bq_project, schema = bq_schema)
            else:
                file_ext_query = """SELECT file_format, COUNT(*) AS file_count FROM `{project}.{schema}.anvil_file` GROUP BY file_format""".format(project = bq_project, schema = bq_schema)
            try:
                df_output = client.query(file_ext_query).result().to_dataframe()
                for i in range(0, len(df_output)):
                    file_format = df_output["file_format"].values[i]
                    file_count = df_output["file_count"].values[i]
                    if file_format_count_dict.get(file_format) == None:
                        file_format_count_dict[file_format] = file_count
                        file_format_source_dict[file_format] = [dataset_entry.name]
                    else:
                        file_format_count_dict[file_format] = file_format_count_dict.get(file_format) + file_count
                        dataset_list = file_format_source_dict.get(file_format)
                        dataset_list.append(dataset_entry.name)
                        file_format_source_dict[file_format] = dataset_list
                    file_format_details.append([dataset_entry.id, dataset_entry.name, file_format, file_count])
            except:
                pass   
    
# Build output records
records = []
for key, val in file_format_count_dict.items():
    records.append([key, val])
df = pd.DataFrame(records, columns =["File Format", "File Count"])
df_sorted = df.sort_values(["File Count", "File Format"], ascending=[False, True], ignore_index=True)
ds_records = []
for key, val in file_format_source_dict.items():
    ds_records.append([key, len(val)])
ds_df = pd.DataFrame(ds_records, columns =["File Format", "Dataset Count"])
ds_df_sorted = ds_df.sort_values(["Dataset Count", "File Format"], ascending=[False, True], ignore_index=True)
df_details = pd.DataFrame(file_format_details, columns =["Dataset ID", "Dataset Name", "File Format", "File Count"])
df_details_sorted = df_details.sort_values(["Dataset Name", "File Count"], ascending=[True, False], ignore_index=True)
df_details_path = "file_format_values.tsv"
df_details_sorted.to_csv(df_details_path, index=False, sep="\t")
!gsutil cp $df_details_path $ws_bucket/ingest_pipeline/resources/search/ 2> stdout
print(f"End time: {datetime.datetime.now()}")
print("File Format Record Counts:")
display(df_sorted)
print("File Format Dataset Counts:")
display(ds_df_sorted)


In [None]:
# Inspect the mapping of file format -> list of dataset names that contain it
# (populated by the file-format summary cell).
file_format_source_dict

## Validate file extensions

In [None]:
def validate_file_extensions(dataset_id):
    """Check that `file_format` values in a dataset's `anvil_file` table appear in `file_name`.

    Looks up the dataset's BigQuery project and dataset name through the TDR
    API, then counts `anvil_file` rows (with a non-null `file_format`) whose
    `file_name` does not contain the `file_format` value.

    Args:
        dataset_id: ID of the TDR dataset to validate.

    Returns:
        str: "Success" when no mismatches are found (including when there are
        no rows to check), otherwise a message starting with "Failure - ".
    """
    # Retrieve dataset information
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        # Presence check only: a response without schema tables is still treated as a failure.
        _ = response["schema"]["tables"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        logging.warning(f"Unable to retrieve dataset info for {dataset_id}: {e}")
        return "Failure - Issue Retrieving Dataset Info"

    # Check files with improper file extensions
    client = bigquery.Client()
    file_query = """
        SELECT COUNT(*) AS file_count, SUM(CASE WHEN file_name LIKE '%'||file_format||'%' THEN 1 ELSE 0 END) AS match_file_count
        FROM `{project}.{schema}.anvil_file`
        WHERE file_format IS NOT NULL
        """.format(project=bq_project, schema = bq_schema)
    try:
        df = client.query(file_query).result().to_dataframe()
        if not df.empty:
            file_count = int(df["file_count"].values[0])
            raw_match_count = df["match_file_count"].values[0]
            # SUM over zero rows is NULL; treat that as "no matches" rather than
            # letting the subtraction/comparison below fail on a missing value.
            match_file_count = 0 if pd.isna(raw_match_count) else int(raw_match_count)
            mismatch_file_count = file_count - match_file_count
            if mismatch_file_count > 0:
                return f"Failure - {mismatch_file_count} files have extensions that don't match the file name"
    except Exception as e:
        logging.warning(f"Unable to validate file extensions for {dataset_id}: {e}")
        return "Failure - Issue Validating File Extensions"
    return "Success"

# Loop through datasets and validate file extensions
dataset_id_list = [
    '8fbfea50-6a71-4b19-98e9-f95e3a8594c7',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '179eb85e-2557-4677-9cba-d763310f3df9',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'd239dd7b-8d10-4960-aa91-8f8ede641e25',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
]
results = []
for dataset_id in dataset_id_list:
    logging.info(f"Validating dataset_id: {dataset_id}")
    status = validate_file_extensions(dataset_id)
    results.append([dataset_id, status])
# Build the summary frame once, after the loop, so it also exists (empty)
# when dataset_id_list has no entries.
results_df = pd.DataFrame(results, columns = ["dataset_id", "validation_status"])
display(results_df)

# Pull Search Facet Values Across Datasets

## Pull from existing populated tables

In [None]:
# Input parameters
# Each entry is written as "<table>.<column>".
# NOTE(review): presumably these are single-valued columns; the code that
# consumes this list is not shown here - confirm.
nonarray_field_list = [
    "anvil_biosample.biosample_type",
    "anvil_biosample.anatomical_site",
    "anvil_donor.organism_type",
    "anvil_donor.phenotypic_sex",
    "anvil_file.file_format",
]
# NOTE(review): presumably these are array-typed columns that need different
# handling from the list above - confirm against the consuming code.
array_field_list = [
    "anvil_dataset.consent_group",
    "anvil_dataset.data_modality", 
    "anvil_dataset.registered_identifier", 
    "anvil_diagnosis.disease", 
    "anvil_donor.reported_ethnicity",
]
dataset_id_list = [
    'cefc1a79-446c-40d2-b140-ba8d8b1c0712',
    '4e699ead-bbb5-460d-9b32-2b1b322c601b',
    'cefc1a79-446c-40d2-b140-ba8d8b1c0712',
    '2355554e-8951-4b41-bcd8-32e18cddb7c9',
    'bafbf771-1cd2-44fc-9b38-5a4bbead8ab2',
    'a36eeaf7-d6dd-4887-bdbd-e435a07ba156',
    '8de6dae2-55ff-4287-9b75-5b2a950c1f44',
    'ce6692aa-0f97-48fa-8628-b8fa3eab4726',
    '31433635-91d4-431d-8d26-bc54e84c8e8c',
    '0b06619d-39d9-4437-8c42-2e415faa634c',
    '12ffb586-5f6a-4f0a-a353-d2f34599f4cc',
    'e642bca0-52fb-4ab3-ab3a-acaab83deda7',
    'b7fb531e-25a4-427c-9679-b7bdc3d03535',
    '3615e063-f24b-47f7-87cb-430e8aca8d0c',
    'a3ea4f97-6657-4d3c-9be6-96f097f5c952',
    '9f4ac69c-0919-4ac1-98a8-976ed79ace03',
    '96461004-f4b3-4f82-a842-293b3ec46a60',
    '841b7883-9447-4ea0-ae4a-84ea0240d919',
    'd0fc3d6a-c3f4-4533-8a23-817a4e27f9be',
    '2cace5dc-f660-45d4-b689-c4c89e77697c',
    'd2272f2d-c606-4027-b8ea-0bdd6d9d6535',
    'b12fb9be-2ce0-4bfd-8503-732fabba06ab',
    '9f9fc99a-b867-49a9-a3dc-8a39efbd5fa2',
    'ce58654d-b7d3-466b-99ba-b203d527a543',
    '179eb85e-2557-4677-9cba-d763310f3df9',
    'cba804c9-0bdd-4219-a53e-98c8db6334a0',
    'd239dd7b-8d10-4960-aa91-8f8ede641e25',
    '5c6a1c4f-ccd3-48a8-ac00-e18e5ecaa0bb',
    '19e2c8ab-853a-4204-86c3-f591125fbf63',
    '7cf0d3d0-f79b-4bfe-bfc8-e4e6c33dd4c3',
    '3a72e4b8-afb4-4299-98ec-a9ba9606be06',
    'f3c89298-0dd2-40da-8627-3baea553b34a',
    '9a32e23e-840d-4ba3-8cd9-392f48b8e9d2',
    'c5d967fd-09ce-4b02-97dd-ac3abf6f79fa',
    '173e56f7-b813-4c41-89ff-09a824e1407f',
    '80312f74-bd56-4938-96ba-e9bed95d1f3b',
    '017445d7-d56e-4e2e-b480-b4879b51e944',
    '13b2076a-cfe1-49ec-ac61-bad1af9a52ea',
    '175dd803-02c7-4823-81d5-9e0621652ace',
    '15492baa-05ed-47bc-b50c-e587679ae51a',
    '33705ce9-b2b3-4edc-9b47-f54283e193cf',
    '92486440-3a46-44dd-b853-b300ef75b31e',
    'dd2cb8fc-42a6-482f-898e-ef6125feccb8',
    '4e99b8e1-40b9-4fb2-90a0-d85e926ef31e',
    '128332b6-5060-4ec4-b6a6-f53b54a810be',
    '06f05f58-3c83-4f5c-bddd-bed7d2d1d147',
    '51e9935f-ec18-4832-801a-6d9186537572',
    'd6291444-8c3c-470c-b28c-7cf1d5c7aad8',
    '41cb9f29-4ba6-4690-821c-cb085e6b0f2f',
    'e68d1d39-99df-4cd7-8053-1b298f03eabb',
    'b252e3ac-4a8c-48e0-9999-5ee0c9a5842d',
    '7ea006d9-1e19-4678-b2e6-d4a1ea327f74',
    '34fd3b22-ac73-47d2-8849-5877158ec072',
    'a08dc7a6-f8ce-4205-95d2-83f614c2c32f',
    '577f36fe-8154-4c82-ac87-b2a64cb68f35',
    '7ce3270e-b2f2-47f4-a288-639751b2f87f',
    '36bdd59f-4f5b-43cd-8d34-a21ef87bbf30',
    '41d12dc1-8718-4439-b409-26cc23573107',
    'c4c49fcd-0c20-4cff-841a-cb58f5689c5b',
    '6b40557c-ddc3-4e7e-8a45-1761e7fcb8b5',
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
    '9ee2a552-89f8-4a48-9c94-9fa26ebb7483',
    '425412ba-894a-4824-acb8-bf18fe4576e0',
    'f22bd762-5c45-453e-bf22-b174514abb84',
    '0ee62643-b064-42f8-9b09-5d10eacd70a3',
    '1a7f6728-5116-4f24-897a-59a7f322cfd2',
    'c37b388c-7107-43d6-bee6-4e82b40ed271',
    'bf6f1d78-6a0d-4afb-aea6-17a3c34340db',
    'a3becdde-018b-46f0-adea-d587076eef4a',
    'a9ad3a05-24fb-4e59-85b0-ee09e55a4492',
    '719f7581-21db-4aec-8c46-4a5811832710',
    '3725b660-1106-4173-9c4b-0a15926becf5',
    '318a75f4-ac50-4944-81b0-70a1323e7497',
    '75fb0984-2124-444f-881b-30a1a6f8b8f7',
    '15be288e-53e1-41cb-8d20-8ea87efb9258',
    '700303c2-fcef-48a5-9900-096bf34e2d83',
    '38fd20ce-affd-4791-9810-7f5a7fe876d0',
    '8b8185d3-ba5c-4832-af23-3ff8ca6ed016',
    '140797da-dc94-4fc2-8b0b-f2e1dec7bd43',
    'ec97fa0f-e174-40fe-a6b8-ee240bdf4318',
    '5488d7c1-5195-4ebc-b0f0-31033fa06dc9',
    '56f9888f-e623-4a1a-b2b4-46378a6cd6fe',
    'b2e7f15b-65d5-4812-abfd-b2dbc6d18850',
    '69f8d7c2-2e14-48e6-b838-7881016313fb',
    '809fa952-3178-46b7-bb82-8a476ef32e67',
    'dfb14a1f-38b2-4668-b98f-59b5b5b53ca3',
    'f553b765-1c9c-464c-a8fa-07700a1691c5',
    'b108dfd0-711d-4bc1-aab5-1b312226c8ad',
    '38eca26c-d79e-4447-99d3-1889d20ade21',
    'd5a0e24d-689a-4854-92c7-9a39f980b523',
    'ecd0606f-4fa7-4e57-b6e8-eea377e65d5b',
    '0c6bc810-5ae0-4926-a56b-2bc2fe7dbe6d',
    '9dc31133-c882-4f39-903a-a25f316bb560',
    '23a0ede0-4f97-46af-9f04-bd2805050980',
    '7593c1c2-3680-4bf5-8a65-dce5f96a3b59',
    '02661394-2886-4ef7-aff1-d53225c82025',
    'c1644d4e-06e2-4fa8-95f1-5c1da5831257',
    '267cf516-dd33-4640-a71a-78bd8f5db9d8',
    'df06ff22-6a2d-4934-aac9-c8368efbea1a',
    'dcdefb14-f6de-4c46-ac7e-842b273416bf',
    '28208cc6-50bf-4864-9a48-981632066640',
    'ccfe264d-a35e-44f8-9b2b-241a0f8327cc',
    'f177843d-47fb-46ae-83be-73c92ee85081',
    'd049d487-1a69-4358-8dad-0e6fa6c06fdd',
    'da29226b-e856-4014-8c8b-c4268d0df2cc',
    '714dcdbd-8d17-40b0-8246-0e941af8175d',
    '53ce7d12-facc-4412-a710-f535efb209a2',
    'b9842819-5fd6-40c5-9668-aae1ea44a308',
    '7c056125-3ed8-459c-b73e-edfa3f80cc27',
    'ed1215f1-787c-40f5-9d77-4b5bc2dfbb84',
    '7feeb2b1-1926-4968-b6dc-e0a1e4cf8d4a',
    'dbf5d87b-4cab-44cd-a792-1d0218aad973',
    '85dbde76-c130-40b2-8a8a-ba815ba499da',
    '1c2fe11d-b020-4c54-8c71-1ea91623d626',
    '84133066-68cd-41fa-819b-d74a3ac85862',
    '54c6fa73-9b84-4a3b-9e97-e4e43165c48b',
    '90ba1853-f845-4502-ba36-b75b9e571bc5',
    'f3c88c3c-8e1b-4af9-9467-0621404e314c',
    '332bb145-6ef1-40ef-932c-aec5bb6210d9',
    '44f83f20-d618-40b5-b2cb-3676b8fe3ad7',
    '3c2c39a9-4cc2-4f7c-89e0-054a871e2c4e',
    '2ebb722f-a3df-4ea4-b72a-813e3db0bab5',
    '1817528a-4f88-4ed6-8965-9eae0220ab27',
    'e03eb011-05f9-4491-b779-0cc2aefabff1',
    'd4bb7169-5a7d-4090-ba62-12ea799c3ade',
    '3f172982-060d-4339-a09b-6994c2c9eb16',
    '51789659-5233-4ee7-8bca-dedebfc87773',
    'a5fe75bb-d28c-42fb-aaf8-92fa37b266d2',
    '6545d602-e5b4-4dd1-8f6a-64e0a1952ddc',
    'e5c79b74-20d5-4b6f-8085-0bc788eed2ea',
    'f492567d-6db8-45c8-b44e-6b5def26c812',
    '12e54f96-00d9-4a38-921e-e0d42610b2c1',
    '845b131e-7c05-4397-ad40-23dea8e9b399',
    'bd492b71-b20e-4056-b8ae-ad8c94cfbc02',
    '9ecc231f-e3d3-4417-a98a-c4db4c638161',
    '3fb2d04a-d18b-4bdc-9372-99b992f2ae42',
    'e922a496-e686-4fa1-911d-2159ceb0f09f',
    '8fbfea50-6a71-4b19-98e9-f95e3a8594c7',
    'd911e57a-ebb8-4be8-876b-d8e5790ddce3',
    '6c9423a2-3ea7-4c3c-9b12-0cc993bc095f',
    '52e015b5-22b7-4a96-9f0a-ea3afccbfcbc',
    '325f3ee8-2adb-4092-bd78-1b5ea5b0d1d6',
    'ae50ef98-ef3d-4427-b094-83b2d90787a0',
    'e6771964-50e9-482f-9d23-18c22cd89ab8',
    '3fbacc64-4c53-4770-8cdf-a616c10ec5c7',
    'a3ae33bb-8b3a-47e5-a2d1-a49c954776b3',
    '0b0a52bb-a1a2-4638-9259-4447761c2da4',
    '0eb42259-7b44-450f-a9d7-500b2ea7179c',
    'e16adabb-88e0-4739-983a-98ac5c181842',
    '71f94dff-fbe8-4881-af1f-4987b67d5181',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '0e65b131-fd14-4fce-908b-c5b89a71a9c1',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
    'd00353de-f6f9-42d9-8a8f-f88b3d880dbf',
    '3be57453-9325-4c2e-b73a-832139b61778',
    'fcc60ac9-0d20-4a7c-97e4-e3c8d3aa8f76',
    'dc5f85d8-333b-4b68-b160-ad9856233887',
    '655e6a61-5400-4d8a-95bc-1506e026b289',
    '1f2d14d4-1bd8-46fc-9d35-1a415e5f326a',
    '64fd39fc-b32e-4b0a-8f83-4bf11b197462',
    'a77a2c65-38fe-4bf7-9ea6-0a2dc65eb21f',
    '25248cd8-2e98-4a83-9ccf-af7214fa71d6',
    '158ebecd-4596-4541-b832-a137232b7036',
    '1ccb95c3-1901-428e-b7bb-34495f41f4d2',
    '02ff1051-cd1d-4bbb-a005-21384cbff846',
    '0144b0d3-a809-46df-8c67-7ce42bdd579a',
    '35a1009d-93a2-49b1-a801-fe84d6b7a2f5',
    '50132478-c9fb-4dc5-86cd-d5dfab909393',
    '35064fc1-6c52-4005-8e99-cb0d6afd3f8c',
    '5cf859f6-990c-4b04-8609-35d5c57920f0',
    '62cfdce6-2d4d-415c-a11e-5ab60131c668',
    '2d07dd45-a263-440d-a339-9ccbab93aba8',
    'f1513955-0264-4733-bd25-3f752c61a323',
    '93e712f2-3e54-466e-aa53-57eb69c43bc0',
    '296f653a-91a8-4139-9bab-e6ae13afe99c',
    '633dc1aa-084d-43bd-9b17-bc6e57f81d48',
    '9320b3b5-3944-4bd2-913a-23b72bccd86c',
    '86ab4d3b-86ce-422b-ae6f-1ec6968a874d',
    '4124010f-7308-4831-80d7-ea14343249ab',
    '3037caeb-fa7a-4924-b399-7e4c7173b3b8',
    '146b72bc-1dcc-4e3b-bcda-d3dd25418012',
    '1d575e14-c3b1-4ead-a63b-a21c08c6a14d',
    'c5b1e333-7203-41ce-b8f7-3ef3a3bd721f',
    'bf519ea2-afe1-486a-9954-7362f10b6b60',
    '254ffffc-2bd6-4b2e-905b-a8c54c348cd0',
    'bb65d291-a673-4e4d-8a37-ab1f7401a902',
    '3a9604d7-456a-453d-a46b-40408624a07e',
    '2cbe079d-e7ab-47d8-836e-454a71440297',
    '84fad495-2756-472f-ad20-f91de6f67baf',
    '28e73469-12d4-493b-bf6f-83359c1f69c5',
    'db266afc-2f75-4b03-a3b8-c69e0ce6f713',
    'dd6866e4-8949-45bd-8910-8ce64f79e3c7',
    '12bbfa4c-c30a-4cf6-b79f-45354f842964',
    '84ac0d05-4be5-43e9-973e-ef999144d802',
    'ff8b1212-858a-4048-8f63-9464c922591a',
    'c814d754-cdc5-4b0d-8671-a39e85b2c473',
    '797b2563-5d56-4f5c-bdaf-3bfd11e8f5b3',
    '85287d84-fefe-40df-ad40-5b135ee0c07f',
    '7eeede5a-c86f-4577-9f3c-65ab618a6dee',
    'a52c04ee-cfef-46bb-9b40-6a9b292e1a7b',
    '0194eea9-d779-4957-8521-11717a378e66',
    '0d82658c-44b3-4cea-a388-3353a96a31ef',
    '77dca0d5-4d22-4415-8858-075590d25cb5',
    '1b05159b-6277-4345-9d59-f7bba5ea1d56',
    '92299ff4-c0d0-4e94-b374-75d0038cbd68',
    '60f96582-79ad-4461-9f9a-53c1bc3d17b6',
    'cc107de7-d623-464a-a875-c8b7ae5fb09d',
    'b5d7c34a-c383-4fc7-aa4d-b6dc941cd41a',
    '85baa8f8-619c-4165-9d3e-53220f645814',
    '3f278de3-f201-4344-9639-d35cd7a62adb',
    'c423b18d-12f2-43e4-97f9-993e2943270e',
    '416b8daa-9537-46db-ae7b-3f5ff5f01dc3',
    '61940344-e6c1-484e-ba10-131f43a9b13a',
    'ceee2791-0fdf-45fc-a4e8-8077916771aa',
    '5205f817-7de0-48b4-89fc-6398cf13bff7',
    '1d2f5472-ab6a-4a9b-ba53-520858cf79db',
    'f757278a-3c74-4690-bf89-5149d21ff3af',
    '5a103ab3-29c3-4d07-a0f6-4999c256cf26',
    '2a263db0-8c33-4171-840f-54bf4755a4b9',
    '9828f3fe-f676-4bf1-b600-5effa24ea9c8',
    '28849dc9-a97f-469b-b2ac-a8ff97693f02',
    '29cd0578-fb47-495a-8f48-b37325eed81a',
    'ba503d2e-48af-48bb-910a-be41790d921c',
    '472f01ad-7bc3-4fe5-9771-2695930dbc95',
    '956cd931-7077-4a08-9c75-ab8b4e5d1eb8',
    '31a42df0-29f6-4d4e-ae5c-2e13abc355f2',
    '13364604-ed08-4a61-89cd-65eb372ac8c3',
    'b724164c-712c-4615-97b7-529a108a753a',
    '22199347-9454-41e5-8912-eb38edd33a25',
    '60cadee3-9e63-4897-ac81-4fb283033648',
    'dcd4112f-09d3-43ed-8441-df9bf4c9ddc8',
    'da02c3f1-371b-4afe-9b5e-b8c584fd5907',
    'f6565f2f-4478-45ad-8c11-04dd242fc6a9',
    '275ea204-4612-4d3c-ac0d-f110f61d62ad',
    '72f73fc5-6a3a-43a0-8cce-09f4726b736c',
    'be72f1e3-b5f5-43f4-80db-6d7de93a654e',
    'b6bf4699-6f61-4c6a-9d42-ad055a0de008',
    '8abf299c-cd4e-4ce0-b5cf-4f9abe8cc891',
    '0b6eb077-2eca-4fe6-b012-26fab725b907',
    '5c659e81-e687-4710-a4fd-000ca593155d',
    '42965913-4223-484a-9b3d-abc0002d277d',
    '72e639d1-b8c5-45fd-9acd-a8e5e2b7fa0d',
    'd6823ccd-7247-4efc-8841-f53f456351ed',
    '1048a860-d5ff-4f61-95e5-851e1266d4c1',
    '8681cdcf-b775-4b56-aace-3f3e448261ef',
    'f0db3b27-c952-477d-bc33-9b96a250e168',
    '9e1a6a7d-b45e-4fd2-a1ff-df131da4c713',
    'd596ee91-481c-4eb5-9a8a-88c1e10ba9b6',
    '24470eb6-97c2-4cd4-b484-87a7d634c5b3',
    '3a781e70-cf6e-41c2-8d68-2326f16986e7',
    '048afc84-cdd2-4b39-8ea5-7351f4699761',
    '3fdcdafd-5328-418b-85f8-47b0006de468',
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    'ec6f49a2-176c-4564-82c5-e751baab46aa',
    '68ea655f-b4a3-43e2-95e4-f158ca2d67dd',
    '0447c960-bbfe-4e42-a95b-dd3d1d9a368e',
    'cc19d19e-6f7e-41b8-87a6-77f41d53e650',
    '16031a34-f1ba-4bde-af43-1822f1516944',
    '483d3454-54da-4243-bbeb-98cbf1d088d0',
    '6765ce2d-ebc8-4367-8855-c0f8e62cb355',
    '1b4a324a-5621-4399-85d2-f91aa03418b5',
    '09642596-d33a-4261-8bf7-eb1dbb37d572',
    '75119ed5-b8aa-4f45-bdef-e3c673bbe44c',
    '04a874df-c57b-40fc-9139-bc3a05129115',
    '2b8ad26a-e66e-4b03-a65a-5b504cecacfd',
    'a7226f10-bdba-4284-97b3-0738a5912770',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    '8da05494-fe7a-4af5-b257-bada143ee426',
    '8b098ab4-df02-4619-8ded-657e496695c1',
    'd48adc59-8934-41bb-9720-63e71f1933be',
    '8e88cabc-e713-44ed-a5d2-41935c3b4eb5',
    'be8cfc23-cd19-46fb-92e1-a77ac380d7aa',
    'e2a398ff-18c3-4258-9d75-89adb2923e88',
    'cb8ebcd0-bb5e-4a6d-bfef-5c651a1a9f6e',
    '6238f8f7-5efb-4023-8d85-ef7db9b4dad7',
    '32c09444-3d4a-44d5-af6b-07eef92189db',
    'd7686f98-05a4-45c9-af2e-3ebc524a5b2d',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '8ccefc59-38a5-476f-b7d3-3f98315a97f0',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '6e67e1e1-5c39-43da-960f-48385789c4e1',
    '92382848-f5e9-426c-b7dc-f2841ae97018',
    '4999a410-990e-484b-b4f3-d636f894a741',
    '1f534eb4-701f-4182-9895-64c5e5b52d82',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    'feca4815-b44b-4b2b-8d77-75edd62ba5a6',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '7427b2eb-a84f-413c-bfb0-7d2e36b0628f',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '28c3df75-0b08-4d5c-9feb-6e2e918572ea',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    'f69c21e9-cb5f-4e72-acfe-c54b672a9f3b',
    '2ef4530a-cc36-4f32-9a1a-63a555346587',
    'e917c83d-c482-442d-81ce-869de7d20903',
    '10774229-1487-4188-b2c3-1fabcf85492a',
    '65793118-3c88-4185-9172-2354850e6056',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    '46536136-08e4-4521-8e6c-67f023de020d',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    'b8e7fe18-9c3d-4cc0-bbc7-85b27197fc8f',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    'b60b4737-c646-4299-85a0-520890e830b7',
    '280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
    'd48db47e-acba-4377-b031-f6dfc21f3658',
    '3fd2204c-8654-4af7-832f-c186447262e0',
    'ae34e63e-13af-48b8-8b72-8137289091b3',
    'd3ed2595-b8be-40c8-b7b6-10a4997b9d2e',
    '575dc7da-58ed-407d-9e88-7b586f28bf65',
    '20ddfcd5-d456-431b-9f05-781e05d873d6',
    '15d41c35-943c-474b-afa6-e1c6d6e4be2b',
    '61803dc8-f649-43e5-ab15-d351f2cef629',
    'abe58d43-e1c7-4953-aa41-4d3b6f6cca44',
    '3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
    'fa278604-7d85-4491-a30d-15c7821f8b00',
    'dd6c6688-b73a-464c-86d9-3369fdf98268',
    'b5c0bf91-9d20-41a2-9dd2-87d0ef0310f9',
    '97c636f9-0983-481f-8ff9-7b5b3ee6b10e',
    '15b153f5-ed02-4216-8f96-99743b8b4fc3',
    '747858c0-d139-4f52-9f0e-a618b880d6d6',
    '32bc49c6-7583-4613-a72f-5edb12b2a808',
    '3eb8ea77-4605-4bb7-90f9-671953abe4a2',
    '2b08cb76-061d-44c6-a00f-b43a5421df5e',
    'ab7e390a-adc5-4f9e-b317-a216a2904c93',
    'd1e6d0e4-d49e-4a16-93c6-7956b2c03414',
    '713f8676-8034-4827-bccc-cd6d95b1a4c4',
    'b00883d8-9251-435d-aefc-8a703d96d2fb',
    'eefbea02-0d65-441e-b455-35aa21d25ba3',
    'ff7e3be2-c0ac-4d97-85da-6229bf7585ac',
    '7ac92a42-e112-49c3-a8f5-8ad2c7ef5578',
    '0701aae2-8661-4eec-84e0-7c8be1c89a18',
    '7efb1905-34b4-4f1c-a8a6-8e64b3640a68',
    '11a2b088-8c1c-47d2-9c1e-455d457d2f05',
    '74608bd9-39e4-4f48-9b7c-1cd9d3c599c9',
    '7baf8e8c-de11-452d-b2e1-aad7c08cc18e',
    '23b0219d-0820-4017-b942-bda8578e90e2',
    'd7bcfc5d-e258-4bd6-a413-bb7a118e6bff',
    'a5f631ea-2b4b-43f2-9ea0-e31f2b11fa27',
    '8523489a-f57c-4993-81e4-1ed86a5c092d',
    '395da421-e6e8-4a26-ac93-eb7050a7cb1f',
    '9cb5ce25-38e8-4628-9ddf-d6aedf5efe0f',
    '615f6246-1c39-4e44-a9d4-c7133a2ae62d',
    'a647528d-925e-4c02-8825-ff54720c6ee4',
    '2c6f63b2-439e-499f-b687-b3fdd88a492e',
    '68a916af-2e0c-41bd-8535-c7eacbc2d1b7',
    '0e7f31a0-c712-4ebf-ab3a-64c37f43e52a',
    'c8b1d323-f352-482e-bf17-82075c23dcee',
    'd30f51c7-d642-4e7d-a168-967b9520a80a',
    '8d89608c-0d61-4d71-a2e3-9fbc6cda69bf',
    '71219f56-551f-4ad4-9a38-cc4aaf8a1e9a',
    '48dd6010-77dc-465b-a27c-695e29b57a5e',
    '3376a8b6-7ef6-4191-97ab-a547da0d330d',
    '21384132-1697-4e9b-b863-a6492d13285d',
    '582f5f8d-b96f-490e-b417-ba824baeb06c',
    '7e825ee6-7c03-43cc-b0a4-0d9203a30bd9',
    '2843292e-e494-4642-90e0-57e5c153f12c',
    '4ecbb7c8-0246-47f8-9654-4caca1d52565',
    '7e3ea1bd-95ba-4cad-90c8-3eec95be9cc8',
    'bbba696b-d023-4bb1-a213-c8bee31e8bae',
    '00bd45f9-beb2-4fb0-8680-bd30e392975a',
    'f85e467a-958f-4da5-a01b-8df883e69122',
    '470eee0f-2053-4d9b-9f5e-ca9661a6cc16',
    '5c1dc76d-b703-445c-9b38-cc2d00b9ab16',
    '0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
    '608d793e-a78b-4872-a50c-21a9eaa60ec3',
    '74ede771-6781-4980-bfb9-5d853b7cdd6f',
    '6c47e282-5d5e-445c-b6bd-c0024946fbe0',
    '6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
    'ff8ffbcf-c932-48c7-8d5e-d995d5680e21',
    '9d74b4f0-b2d4-46aa-867a-52fb6102bfdf',
    'e34f15f7-c225-4314-a638-90504bb0aa0d',
    'c1d222ab-bc0e-4e13-8379-0ee5be9e140e',
    '582187a5-ad63-4759-9162-55fa6337eb07',
    'd1e8d19a-970d-4ede-b5bc-9cab7237adec',
    'af867604-d801-41cc-9949-017eb30a0cbf',
    'c9dd3578-01db-4687-9807-4f71368941d1',
    '722e332c-fb1a-45fe-80c7-cc670f025b7f',
    '9f152896-ebf1-4756-b678-bdf739a92256',
    '478aa270-fbd4-4a45-8f63-221b4066168e',
    'e9c7ad29-2213-4648-9164-33a07bd42cdb',
    '1d140c76-a06b-42a0-bae8-b9e169ebe394',
    '5edcc3db-c676-412a-9506-600959bb81f2',
    '9f7dbe05-96b5-4b2f-9f3a-34b552e3dd21',
    'ccc524ab-d9ad-467c-a25b-9a14fb05e976',
    '9e3fb02d-dcf6-486f-a42d-89446a852057',
    '4b341ba9-49a5-43a2-9b7e-cc96beb59946',
    '15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
    '3a3100bb-369e-47c1-a77c-2cacb7cf020d',
    '2c11b505-17c8-402e-8422-0239accb449d',
    'a6d7e030-e6c8-4c62-8cb5-165ef54987c4',
    'e25a8172-1e34-442c-a45d-583027a2d734',
    '0c18589c-6432-4a6c-90ce-985a47a66f39',
    'c911503c-f010-4c17-ac57-1d82e954bdc7',
    '487016d8-ea02-4b20-a45f-7382139aa865',
    '677f0bdf-6c5c-462b-8294-3666f777bbc5',
    '34da5c11-bbe8-4e55-8d89-9ef8a1c66200',
    '9a4d9d5f-72aa-4d7d-90f1-6d1181ee984c',
    'e6b15b39-daba-431f-a918-e4e43e702c30',
    'bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8',
    '128dce74-fa37-4f2f-8a80-d542edd81a11',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'f461fca1-80b2-4980-83a8-e165d49acc18',
    '37f0f1f9-83fb-49a1-9941-093c068c32d0',
    'cfb3dad7-c6d9-47c0-81b0-2133d75f5c0d',
    'c5c0893f-b254-4038-8d08-b28ef5a26b5d',
    'bbcf8529-1a04-43fc-b6cf-cb161028159d',
    '06421648-dfcb-4460-b93b-c7d6804dddbb',
    'e0b28b59-1cb5-44f4-ab8f-badf5c74f69f',
    '631deea0-2821-4d14-ad02-dc0ce4864924',
    '95788aa7-c897-4ae8-9166-4b8fc1fc5342',
    'eede320a-ed63-41d8-960d-5405a26a194f',
    '36dccf81-6932-43ae-9864-53379832d878',
    '9102024d-58c0-4bb9-aa55-12c00d98b6cd',
    '01eaf423-8cab-491a-b82e-6915dbc73594',
    '0481a135-9db1-424f-9065-a83ebd7ec995',
    'b60876c5-d825-4303-befb-ffff55b92aba',
    '49022563-1be1-4e42-a11c-01743cd5c94d',
    '64f2dbe1-6f58-493d-ab6b-c93568d828f4',
    'a9626803-72c2-4e23-968c-a090e3f22c5e',
    '095728d6-4ea1-4909-8a74-a8f3fa7f86cb',
    '8309cd89-a912-462a-90ad-f13ae0d7aa6c',
    '902596ce-714e-49b3-8271-f3dfece52309',
    'f5f29e4b-68f7-443c-b290-0827d4167fd5',
    '07c3a7f4-1e59-4dcb-a244-2fd3d084e2b0',
    '6f49717f-8f57-42d0-8548-316ecc292415',
    '7e693091-8ae4-4c40-8e66-c3b39f01b90e',
    '544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
    '5137255f-0c58-4ac7-9266-bda8ab0247c2',
    '5243df74-712d-49a8-989b-528d15088e8f',
    '278a26cb-a710-4fff-928e-fc2e7084a75a',
    '58a1d168-8290-4c69-bf01-17ba3a084365',
    '9fc492f3-8d13-47ae-93e9-812c0224f1aa',
    '822d381e-cea0-45bb-8fa0-1b7194b4b64b',
    '2dc01a50-ea7b-4d9b-be57-1ffbdd98b27b',
    '5b6676dc-f46e-43a8-b87c-e431e369e53c',
    'eb35085f-0cbf-4829-a3ad-acaa53a250b5',
    '36fa2d20-622d-4cca-80b0-683672c94170',
    '44161b51-953d-4f6b-9448-5cba4a44a9d8',
    '0b90b2ea-8ca3-406a-9f69-95eddf7699ef',
    'bfb202bc-4078-4df0-82b9-9218dbc1f1a1',
    '8cc59f51-b0df-4a5d-a3c5-83ee526ff1af',
    'fda7c4b9-9f35-482b-9eff-be7f11058d94',
    '85646f4a-e424-4363-8033-1e7522e8f175',
    '8945794e-174a-49f9-a2d4-4242f9bf3833',
    '529343b4-698a-4b36-ac55-db8a6965ad3f',
    '0f949ee9-0986-42b2-af5d-0f4c8338c664',
    'cb1f06fa-b916-477d-8ab6-fb4b3f24efd3',
    '05253b3c-e8a3-4db4-8a6d-014eac7b3d94',
    '4807db90-b0f7-441d-b489-932f9b341f74',
    'c33b1f32-6021-4d1c-a4d5-fc3d501107f4',
    'aa314675-af62-41df-b5cb-3b22558e903b',
    '20741062-7d1d-44b7-bc33-39c9ad26e414',
    '69ce1be3-1815-43a4-bdd2-4696d9c8d09a',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '18716daf-4223-44a9-bba9-fc9baeef7d07',
    '475430c5-28cb-456d-9c5c-bdbfab9fafb2',
    'f0061cb3-688e-4ad4-aeb8-8614282292ec',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '3a89c170-2939-4c12-9940-f32d96fa9e55',
    '9a06c401-da3f-41b4-b38b-238796fcae09',
    'b32d88c8-31e3-4789-a75f-e52bf1272937',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    '5e0e8f9a-ce97-4b18-9540-3015c61e393c',
    '1c8ba244-1c7f-433a-825b-d2d34d018dcf',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '373ff2e8-0f63-4179-a55c-3fe0b85556aa',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '7577f264-8e84-440d-9346-7c4d5affda51',
    'febd8561-4769-4f3b-b7c0-ae7ff6ede2e9',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    '2355554e-8951-4b41-bcd8-32e18cddb7c9',
    'de1e7762-673f-4d44-8f45-7e693bb338b7',
    '239a484f-67c2-4ba3-a3d0-d6e4c2b27475',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    'c6f3bd64-ea67-488f-904f-f0bdf6320b5c',
    '166746e8-ce26-4fa1-a587-443ca9fc59a1',
    '80baf71d-28d0-4bca-81b7-49ddfadfa7a3',
    '263ab7c9-bd69-45dd-abb7-bbf35b9786ed',
    'e0c7877e-75d7-47d8-b5e9-5dd677d03353',
    '49a97523-0a7a-4d5a-ae20-496f86de2032',
    '583023a1-aa12-40e2-a964-8ad50ad400ba',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '17d3ffb4-e891-4ac6-a91a-fd52971c1115',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '488a38ee-f996-482d-a562-a4474f5594de',
    '680d748c-7c60-46e2-aea5-7fc557a916ea',
    '462d992a-7c13-45ac-a6da-1254fc3a9031',
    'dbb4df81-9115-45d1-b51d-875e0669edc4',
    '6fd0f009-3c34-4529-9a38-c59745545490',
    '74d1e549-5ae8-4410-9428-f8f2cc85fa80',
    '868f72af-99e8-406e-9f7e-14577e6c7157',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    '595b6755-e7ae-4e83-af2e-693c089aeec3',
    '4d01e12e-503e-4447-8e49-8c2b77ffb00d',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    '279e5670-8a47-4992-bb10-14e6c719db97',
    'd306000b-88c1-4220-8d7e-933c0118a983',
    '703c4bc2-81bf-435a-87fa-21dc9278bad6',
    'ab76b5ca-e464-4063-b949-853f61036370',
    '672b617f-936e-440a-a735-80f94798aed1',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
    'c9986260-0c1b-4fd3-8132-6fa7353046e6',
    'bda2bec8-a142-47ab-bfb4-83759ac2bddd',
    'e858d4f9-3385-4640-b0cb-4894e86d501c',
    '39fe0c8b-bd78-4565-9415-63eabc1d6d85',
    'da4e904f-0346-4cd3-a5c2-ba932511d98d',
    'c46c2220-da88-4f60-a0cf-eebfd0a8ff12',
    '629e31cb-dd7b-4345-abf2-fa23c6c65a09',
    'e9a57082-5a93-481a-bbd0-1acb03ac751a',
    '0faf149d-b316-4fbd-8605-a59354f0eacd',
    '1d23d3cc-5db6-4734-bfaa-507dd366d99b',
    'dd58f556-0049-49c3-9a51-d6470a2abddc',
    '53185d06-f2cc-4942-88c4-8534b559a9ff',
    '0132f320-830d-40d0-a4da-06a5d5f9e8d9',
    'bb7d6408-941a-4da6-8613-36498bc6d91b',
    'b8d11ca1-3db8-4efa-bf57-0305e004a26d',
    '9bd56ad6-080e-4d26-acca-83e4df8aa913',
    '00c11c7e-8530-4bfc-abd7-8c10f4c602d3',
    '8f6b9e20-9468-4f46-aa45-eeab9de88e53',
    '272dff18-acf3-4874-a55f-ba8fb6f80352',
    '70ac3659-06bb-4022-be55-af81d3e35b6f',
    '409b92cf-5c4d-4997-9736-ef2ea10d19e9',
    'a8636719-e26c-49b6-9a53-7d77f3d3c94b',
    'a3e81d5f-8dd6-43dd-9172-d80d212efa2d',
    'd40af129-c13f-45b2-92f0-d0e8fa5cc1c9',
    'ecd2d2f9-2b6f-4743-8d04-c9bb554a96cb',
    '9ee78822-7acd-4fab-9999-c58e9fe266ad',
]

# Loop through datasets and pull results
df_results = pd.DataFrame(columns = ["dataset_id", "table", "column", "value", "row_count", "status"])
for dataset_id in dataset_id_list:
    
    # Establish API client and pull dataset details
    print(f"Processing dataset_id {dataset_id}...")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        print("Failure - Issue Retrieving Dataset Info")
        continue
    
    # Loop through non-array fields and pull data
    for field in nonarray_field_list:
        table_name = field.split(".")[0]
        field_name = field.split(".")[1]
        client = bigquery.Client()
        query = f"""SELECT '{dataset_id}' AS dataset_id, '{table_name}' AS table, '{field_name}' AS column, {field_name} AS value, COUNT(*) AS row_count, 'Success' AS status FROM `{bq_project}.{bq_schema}.{table_name}` GROUP BY {field_name}"""
        try:
            df_output = client.query(query).result().to_dataframe()
            if df_output.empty:
                df_output = pd.DataFrame(data={"dataset_id": [dataset_id], "table": [table_name], "column": [field_name], "value": [None], "row_count": [0], "status": ["Success"]}) 
        except Exception as e:
            print(f"Failure - Issue Pulling Data for {field}")
            df_output = pd.DataFrame(data={"dataset_id": [dataset_id], "table": [table_name], "column": [field_name], "value": [None], "row_count": [0], "status": ["Failure"]})
        df_results = pd.concat([df_results, df_output])
    
    # Loop through array fields and pull data 
    for field in array_field_list:
        table_name = field.split(".")[0]
        field_name = field.split(".")[1]
        client = bigquery.Client()
        query = f"""SELECT '{dataset_id}' AS dataset_id, '{table_name}' AS table, '{field_name}' AS column, {field_name}_unnested AS value, COUNT(*) AS row_count, 'Success' AS status FROM `{bq_project}.{bq_schema}.{table_name}` CROSS JOIN UNNEST({field_name}) AS {field_name}_unnested GROUP BY {field_name}_unnested"""
        try:
            df_output = client.query(query).result().to_dataframe()
            if df_output.empty:
                df_output = pd.DataFrame(data={"dataset_id": [dataset_id], "table": [table_name], "column": [field_name], "value": [None], "row_count": [0], "status": ["Success"]}) 
        except Exception as e:
            print(f"Failure - Issue Pulling Data for {field}")
            df_output = pd.DataFrame(data={"dataset_id": [dataset_id], "table": [table_name], "column": [field_name], "value": [None], "row_count": [0], "status": ["Failure"]})
        df_results = pd.concat([df_results, df_output])

# Aggregate and display final results
print("\nDataset Level Results:")
df_sorted = df_results.sort_values(["dataset_id", "table", "column", "row_count"], ascending=[True, True, True, False], ignore_index=True)
df_sorted_path = "search_facet_values.tsv"
df_sorted.to_csv(df_sorted_path, index=False, sep="\t")
!gsutil cp $df_sorted_path $ws_bucket/ingest_pipeline/resources/search/ 2> stdout
display(df_sorted)
print("\nAggregated Results:")
df_results_agg = df_results.groupby(["table", "column", "value"])["row_count"].sum().reset_index().sort_values(["table", "column", "row_count"], ascending=[True, True, False], ignore_index=True)
df_results_agg_path = "search_facet_values_agg.tsv"
df_results_agg.to_csv(df_results_agg_path, index=False, sep="\t")
!gsutil cp $df_results_agg_path $ws_bucket/ingest_pipeline/resources/search/ 2> stdout
display(df_results_agg)
        

## Pull based on mapping queries

In [None]:
# Input parameters
dataset_map_pairs = [
    ['cefc1a79-446c-40d2-b140-ba8d8b1c0712', 'gregor_1'],
    ['4e699ead-bbb5-460d-9b32-2b1b322c601b', 'gtex_ext_2'],
]

# Loop through dataset-map pairs and process:
df_results = pd.DataFrame(columns = ["dataset_id", "mapping_spec", "attribute", "source_value", "mapped_value", "record_count"])
for dataset_map_entry in dataset_map_pairs:
    dataset_id = dataset_map_entry[0]
    mapping_target_spec = dataset_map_entry[1]
    
    # Establish API client and pull dataset details
    print(f"Processing dataset_id {dataset_id} with mapping specification {mapping_target_spec}...")
    api_client = utils.refresh_tdr_api_client()
    datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
    src_schema_dict = {}
    try:
        response = datasets_api.retrieve_dataset(id=dataset_id, include=["SCHEMA", "ACCESS_INFORMATION"]).to_dict()
        src_schema_dict["tables"] = response["schema"]["tables"]
        src_schema_dict["relationships"] = response["schema"]["relationships"]
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_schema = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        print("Error retrieving source schema from TDR. Error: {}".format(e))
        
    # Retrieve target schema and mapping specification
    target_schema_dict = {}
    mapping_spec = {}
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(ws_bucket_name)
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/anvil/mapping_schema_object.json")
        target_schema_dict = json.loads(blob.download_as_string(client=None))
    except Exception as e:
        print("Error retrieving target schema for specified mapping_target. Error: {}".format(e))
    try:
        blob = bucket.blob(f"ingest_pipeline/mapping/anvil/{mapping_target_spec}/mapping_specification.json")
        blob_string = blob.download_as_text(client=None)
        blob_string = blob_string.replace("$DATASET_NAME", "Dataset").replace("$PROJECT_NAME", "Project")
        blob_string = blob_string.replace("$BQ_DATASET", bq_project + "." + bq_schema)
        mapping_spec = json.loads(blob_string)
    except Exception as e:
        print("Error retrieving mapping specification for specified mapping_target and mapping_target_spec. Error: {}".format(e))

    # Update aliases in mapping specification
    mapping_spec = bmq.update_mapping_spec_aliases(mapping_spec, src_schema_dict)
    
    # Evaluate vocab maps
    df = bmq.evaluate_vocab_mapping(mapping_spec, src_schema_dict, target_schema_dict, bq_project, bq_schema)
    df["dataset_id"] = pd.Series([dataset_id for x in range(len(df.index))])
    df["mapping_spec"] = pd.Series([mapping_target_spec for x in range(len(df.index))])
    df_results = pd.concat([df_results, df])
    
# Aggregate and display final results
print("\nDataset Level Results:")
df_sorted = df_results.sort_values(["dataset_id", "attribute", "record_count"], ascending=[True, True, False], ignore_index=True)
df_sorted_path = "search_facet_values_v2.tsv"
df_sorted.to_csv(df_sorted_path, index=False, sep="\t")
!gsutil cp $df_sorted_path $ws_bucket/ingest_pipeline/resources/search/ 2> stdout
#display(df_sorted)
print("\nAggregated Results:")
df_results_agg = df_results.groupby(["attribute", "source_value", "mapped_value"], dropna=False)["record_count"].sum().reset_index().sort_values(["attribute", "record_count"], ascending=[True, False], ignore_index=True)
df_results_agg_path = "search_facet_values_agg_v2.tsv"
df_results_agg.to_csv(df_results_agg_path, index=False, sep="\t")
!gsutil cp $df_results_agg_path $ws_bucket/ingest_pipeline/resources/search/ 2> stdout
display(df_results_agg)


# Pull Study/Consent Info Across Datasets

In [None]:
#############################################
## Functions
#############################################

def check_phs_and_consent(dataset_id_list):
    """Display the study identifier and consent details recorded for each dataset.

    For every dataset ID, the dataset's BigQuery location is looked up through the
    TDR datasets API and the title, registered identifier, consent group and data
    use permission are read from its `anvil_dataset` table (first row only).

    Args:
        dataset_id_list: Iterable of TDR dataset IDs to examine.

    Returns:
        None. A results table with one row per dataset is shown via `display`.
        Datasets that could not be looked up or queried appear with Status
        "Failure" and the error text in Message; they do not raise.
    """
    result_columns = ["Dataset ID", "Title", "PHS ID", "Consent Group", "Data Use Permission", "Status", "Message"]
    
    # Loop through and process dataset IDs
    results = []
    for dataset_id in dataset_id_list:
    
        # Retrieve dataset information
        logging.info(f"Processing dataset_id = {dataset_id}...")
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
        try:
            response = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION"]).to_dict()
            bq_project = response["access_information"]["big_query"]["project_id"]
            bq_dataset = response["access_information"]["big_query"]["dataset_name"]
        except Exception as e:
            error_message = f"Error retrieving dataset details: {str(e)}"
            logging.error(error_message)
            # Every row must have one entry per result column so that Status and
            # Message line up with the success rows.
            results.append([dataset_id, None, None, None, None, "Failure", error_message])
            continue
            
        # Review the recorded PHS ID and consent code
        client = bigquery.Client()
        query = """SELECT title, registered_identifier, consent_group, data_use_permission FROM `{project}.{dataset}.anvil_dataset`""".format(project=bq_project, dataset=bq_dataset)
        try:
            df = client.query(query).result().to_dataframe()
            # Only the first row is reported; the three joined fields are concatenated with no separator
            results.append([dataset_id, df["title"].values[0], "".join(df["registered_identifier"].values[0]), "".join(df["consent_group"].values[0]), "".join(df["data_use_permission"].values[0]), "Success", ""])
        except Exception as e:
            # Also reached when the table is empty or a field cannot be joined
            error_message = f"BigQuery error: {str(e)}"
            logging.error(error_message)
            results.append([dataset_id, None, None, None, None, "Failure", error_message])
                
    # Display results
    logging.info("\nResults:")
    df_results = pd.DataFrame(results, columns=result_columns)
    display(df_results)


#############################################
## Input Parameters
#############################################

# List of dataset IDs to examine
dataset_id_list = [
    'e2a398ff-18c3-4258-9d75-89adb2923e88',
    'd7bcfc5d-e258-4bd6-a413-bb7a118e6bff',
    '8da05494-fe7a-4af5-b257-bada143ee426',
    '8b098ab4-df02-4619-8ded-657e496695c1',
    '8e88cabc-e713-44ed-a5d2-41935c3b4eb5',
    'be8cfc23-cd19-46fb-92e1-a77ac380d7aa',
    '6c9423a2-3ea7-4c3c-9b12-0cc993bc095f',
    'ae50ef98-ef3d-4427-b094-83b2d90787a0',
    '3fbacc64-4c53-4770-8cdf-a616c10ec5c7',
    '0b0a52bb-a1a2-4638-9259-4447761c2da4',
    '0eb42259-7b44-450f-a9d7-500b2ea7179c',
    'e16adabb-88e0-4739-983a-98ac5c181842',
    '0e65b131-fd14-4fce-908b-c5b89a71a9c1',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '7577f264-8e84-440d-9346-7c4d5affda51',
    'febd8561-4769-4f3b-b7c0-ae7ff6ede2e9',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    'de1e7762-673f-4d44-8f45-7e693bb338b7',
    '74d1e549-5ae8-4410-9428-f8f2cc85fa80',
    'db266afc-2f75-4b03-a3b8-c69e0ce6f713',
    '12bbfa4c-c30a-4cf6-b79f-45354f842964',
    '84ac0d05-4be5-43e9-973e-ef999144d802',
    'ff8b1212-858a-4048-8f63-9464c922591a',
    'c814d754-cdc5-4b0d-8671-a39e85b2c473',
    '797b2563-5d56-4f5c-bdaf-3bfd11e8f5b3',
    '7eeede5a-c86f-4577-9f3c-65ab618a6dee',
    'a52c04ee-cfef-46bb-9b40-6a9b292e1a7b',
    '0194eea9-d779-4957-8521-11717a378e66',
    '0d82658c-44b3-4cea-a388-3353a96a31ef',
    '77dca0d5-4d22-4415-8858-075590d25cb5',
    '1b05159b-6277-4345-9d59-f7bba5ea1d56',
    '92299ff4-c0d0-4e94-b374-75d0038cbd68',
    '60f96582-79ad-4461-9f9a-53c1bc3d17b6',
    'b5d7c34a-c383-4fc7-aa4d-b6dc941cd41a',
    '85baa8f8-619c-4165-9d3e-53220f645814',
    'c423b18d-12f2-43e4-97f9-993e2943270e',
    'ceee2791-0fdf-45fc-a4e8-8077916771aa',
    '2a263db0-8c33-4171-840f-54bf4755a4b9',
    '9828f3fe-f676-4bf1-b600-5effa24ea9c8',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
    'bda2bec8-a142-47ab-bfb4-83759ac2bddd',
    '629e31cb-dd7b-4345-abf2-fa23c6c65a09',
    '956cd931-7077-4a08-9c75-ab8b4e5d1eb8',
    '31a42df0-29f6-4d4e-ae5c-2e13abc355f2',
    '13364604-ed08-4a61-89cd-65eb372ac8c3',
    '53185d06-f2cc-4942-88c4-8534b559a9ff',
    '22199347-9454-41e5-8912-eb38edd33a25',
    '0132f320-830d-40d0-a4da-06a5d5f9e8d9',
    '60cadee3-9e63-4897-ac81-4fb283033648',
    'bb7d6408-941a-4da6-8613-36498bc6d91b',
    'dcd4112f-09d3-43ed-8441-df9bf4c9ddc8',
    'b8d11ca1-3db8-4efa-bf57-0305e004a26d',
    '9bd56ad6-080e-4d26-acca-83e4df8aa913',
    'da02c3f1-371b-4afe-9b5e-b8c584fd5907',
    '8f6b9e20-9468-4f46-aa45-eeab9de88e53',
    '72f73fc5-6a3a-43a0-8cce-09f4726b736c',
    '72e639d1-b8c5-45fd-9acd-a8e5e2b7fa0d',
    '8681cdcf-b775-4b56-aace-3f3e448261ef',
    'f0db3b27-c952-477d-bc33-9b96a250e168',
    'b12fb9be-2ce0-4bfd-8503-732fabba06ab',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '3a89c170-2939-4c12-9940-f32d96fa9e55',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '373ff2e8-0f63-4179-a55c-3fe0b85556aa',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    '3fdcdafd-5328-418b-85f8-47b0006de468',
    'c6f3bd64-ea67-488f-904f-f0bdf6320b5c',
    '166746e8-ce26-4fa1-a587-443ca9fc59a1',
    '595b6755-e7ae-4e83-af2e-693c089aeec3',
    'a3ea4f97-6657-4d3c-9be6-96f097f5c952',
    '9f4ac69c-0919-4ac1-98a8-976ed79ace03',
    '96461004-f4b3-4f82-a842-293b3ec46a60',
    '841b7883-9447-4ea0-ae4a-84ea0240d919',
    'd0fc3d6a-c3f4-4533-8a23-817a4e27f9be',
    '2cace5dc-f660-45d4-b689-c4c89e77697c',
    'd2272f2d-c606-4027-b8ea-0bdd6d9d6535',
    '179eb85e-2557-4677-9cba-d763310f3df9',
    '13b2076a-cfe1-49ec-ac61-bad1af9a52ea',
    '175dd803-02c7-4823-81d5-9e0621652ace',
    '15492baa-05ed-47bc-b50c-e587679ae51a',
    '33705ce9-b2b3-4edc-9b47-f54283e193cf',
    '92486440-3a46-44dd-b853-b300ef75b31e',
    '3725b660-1106-4173-9c4b-0a15926becf5',
    '8b8185d3-ba5c-4832-af23-3ff8ca6ed016',
    'ec97fa0f-e174-40fe-a6b8-ee240bdf4318',
    '5488d7c1-5195-4ebc-b0f0-31033fa06dc9',
    '56f9888f-e623-4a1a-b2b4-46378a6cd6fe',
    'b2e7f15b-65d5-4812-abfd-b2dbc6d18850',
    '69f8d7c2-2e14-48e6-b838-7881016313fb',
    '809fa952-3178-46b7-bb82-8a476ef32e67',
    'dfb14a1f-38b2-4668-b98f-59b5b5b53ca3',
    'f553b765-1c9c-464c-a8fa-07700a1691c5',
    'b108dfd0-711d-4bc1-aab5-1b312226c8ad',
    '38eca26c-d79e-4447-99d3-1889d20ade21',
    'd5a0e24d-689a-4854-92c7-9a39f980b523',
    'ecd0606f-4fa7-4e57-b6e8-eea377e65d5b',
    '0c6bc810-5ae0-4926-a56b-2bc2fe7dbe6d',
    '9dc31133-c882-4f39-903a-a25f316bb560',
    '23a0ede0-4f97-46af-9f04-bd2805050980',
    '7593c1c2-3680-4bf5-8a65-dce5f96a3b59',
    '02661394-2886-4ef7-aff1-d53225c82025',
    'c1644d4e-06e2-4fa8-95f1-5c1da5831257',
    '267cf516-dd33-4640-a71a-78bd8f5db9d8',
    'df06ff22-6a2d-4934-aac9-c8368efbea1a',
    'dcdefb14-f6de-4c46-ac7e-842b273416bf',
    '28208cc6-50bf-4864-9a48-981632066640',
    'ccfe264d-a35e-44f8-9b2b-241a0f8327cc',
    'f177843d-47fb-46ae-83be-73c92ee85081',
    'd049d487-1a69-4358-8dad-0e6fa6c06fdd',
    'da29226b-e856-4014-8c8b-c4268d0df2cc',
    '714dcdbd-8d17-40b0-8246-0e941af8175d',
    '53ce7d12-facc-4412-a710-f535efb209a2',
    'b9842819-5fd6-40c5-9668-aae1ea44a308',
    '7c056125-3ed8-459c-b73e-edfa3f80cc27',
    'ed1215f1-787c-40f5-9d77-4b5bc2dfbb84',
    '7feeb2b1-1926-4968-b6dc-e0a1e4cf8d4a',
    'dbf5d87b-4cab-44cd-a792-1d0218aad973',
    '85dbde76-c130-40b2-8a8a-ba815ba499da',
    '1c2fe11d-b020-4c54-8c71-1ea91623d626',
    '84133066-68cd-41fa-819b-d74a3ac85862',
    '54c6fa73-9b84-4a3b-9e97-e4e43165c48b',
    '90ba1853-f845-4502-ba36-b75b9e571bc5',
    'f3c88c3c-8e1b-4af9-9467-0621404e314c',
    '332bb145-6ef1-40ef-932c-aec5bb6210d9',
    '44f83f20-d618-40b5-b2cb-3676b8fe3ad7',
    '3c2c39a9-4cc2-4f7c-89e0-054a871e2c4e',
    '2ebb722f-a3df-4ea4-b72a-813e3db0bab5',
    '1817528a-4f88-4ed6-8965-9eae0220ab27',
    'e03eb011-05f9-4491-b779-0cc2aefabff1',
    'd4bb7169-5a7d-4090-ba62-12ea799c3ade',
    '3f172982-060d-4339-a09b-6994c2c9eb16',
    '51789659-5233-4ee7-8bca-dedebfc87773',
    'a5fe75bb-d28c-42fb-aaf8-92fa37b266d2',
    '6545d602-e5b4-4dd1-8f6a-64e0a1952ddc',
    'e5c79b74-20d5-4b6f-8085-0bc788eed2ea',
    'f492567d-6db8-45c8-b44e-6b5def26c812',
    '12e54f96-00d9-4a38-921e-e0d42610b2c1',
    '845b131e-7c05-4397-ad40-23dea8e9b399',
    'bd492b71-b20e-4056-b8ae-ad8c94cfbc02',
    '9ecc231f-e3d3-4417-a98a-c4db4c638161',
    '3fb2d04a-d18b-4bdc-9372-99b992f2ae42',
    'e922a496-e686-4fa1-911d-2159ceb0f09f',
    '52e015b5-22b7-4a96-9f0a-ea3afccbfcbc',
    'a3ae33bb-8b3a-47e5-a2d1-a49c954776b3',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '655e6a61-5400-4d8a-95bc-1506e026b289',
    '02ff1051-cd1d-4bbb-a005-21384cbff846',
    '35a1009d-93a2-49b1-a801-fe84d6b7a2f5',
    '50132478-c9fb-4dc5-86cd-d5dfab909393',
    'f1513955-0264-4733-bd25-3f752c61a323',
    '93e712f2-3e54-466e-aa53-57eb69c43bc0',
    '2cbe079d-e7ab-47d8-836e-454a71440297',
    '84fad495-2756-472f-ad20-f91de6f67baf',
    '28e73469-12d4-493b-bf6f-83359c1f69c5',
    'dd6866e4-8949-45bd-8910-8ce64f79e3c7',
    '85287d84-fefe-40df-ad40-5b135ee0c07f',
    'cc107de7-d623-464a-a875-c8b7ae5fb09d',
    '416b8daa-9537-46db-ae7b-3f5ff5f01dc3',
    '61940344-e6c1-484e-ba10-131f43a9b13a',
    '5205f817-7de0-48b4-89fc-6398cf13bff7',
    'f757278a-3c74-4690-bf89-5149d21ff3af',
    '5a103ab3-29c3-4d07-a0f6-4999c256cf26',
    '28849dc9-a97f-469b-b2ac-a8ff97693f02',
    '29cd0578-fb47-495a-8f48-b37325eed81a',
    'ba503d2e-48af-48bb-910a-be41790d921c',
    '472f01ad-7bc3-4fe5-9771-2695930dbc95',
    'b724164c-712c-4615-97b7-529a108a753a',
    'f6565f2f-4478-45ad-8c11-04dd242fc6a9',
    '275ea204-4612-4d3c-ac0d-f110f61d62ad',
    'be72f1e3-b5f5-43f4-80db-6d7de93a654e',
    'b6bf4699-6f61-4c6a-9d42-ad055a0de008',
    '8abf299c-cd4e-4ce0-b5cf-4f9abe8cc891',
    '0b6eb077-2eca-4fe6-b012-26fab725b907',
    '5c659e81-e687-4710-a4fd-000ca593155d',
    '42965913-4223-484a-9b3d-abc0002d277d',
    'd6823ccd-7247-4efc-8841-f53f456351ed',
    '1048a860-d5ff-4f61-95e5-851e1266d4c1',
    '9e1a6a7d-b45e-4fd2-a1ff-df131da4c713',
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    'ec6f49a2-176c-4564-82c5-e751baab46aa',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'd48adc59-8934-41bb-9720-63e71f1933be',
    '6238f8f7-5efb-4023-8d85-ef7db9b4dad7',
    '32c09444-3d4a-44d5-af6b-07eef92189db',
    '280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
    '20ddfcd5-d456-431b-9f05-781e05d873d6',
    '15d41c35-943c-474b-afa6-e1c6d6e4be2b',
    '3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
    'dd6c6688-b73a-464c-86d9-3369fdf98268',
    '15b153f5-ed02-4216-8f96-99743b8b4fc3',
    '747858c0-d139-4f52-9f0e-a618b880d6d6',
    '3376a8b6-7ef6-4191-97ab-a547da0d330d',
    'f85e467a-958f-4da5-a01b-8df883e69122',
    '0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
    '6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
    'c9dd3578-01db-4687-9807-4f71368941d1',
    '5edcc3db-c676-412a-9506-600959bb81f2',
    'ccc524ab-d9ad-467c-a25b-9a14fb05e976',
    '4b341ba9-49a5-43a2-9b7e-cc96beb59946',
    '0c18589c-6432-4a6c-90ce-985a47a66f39',
    'e6b15b39-daba-431f-a918-e4e43e702c30',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'e0b28b59-1cb5-44f4-ab8f-badf5c74f69f',
    'aa314675-af62-41df-b5cb-3b22558e903b',
    '20741062-7d1d-44b7-bc33-39c9ad26e414',
    '69ce1be3-1815-43a4-bdd2-4696d9c8d09a',
    '18716daf-4223-44a9-bba9-fc9baeef7d07',
    '475430c5-28cb-456d-9c5c-bdbfab9fafb2',
    'f0061cb3-688e-4ad4-aeb8-8614282292ec',
    '9a06c401-da3f-41b4-b38b-238796fcae09',
    'b32d88c8-31e3-4789-a75f-e52bf1272937',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    '5e0e8f9a-ce97-4b18-9540-3015c61e393c',
    '1c8ba244-1c7f-433a-825b-d2d34d018dcf',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '17d3ffb4-e891-4ac6-a91a-fd52971c1115',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '680d748c-7c60-46e2-aea5-7fc557a916ea',
    '462d992a-7c13-45ac-a6da-1254fc3a9031',
    '4bc5b4eb-da91-48f7-bca0-134ed1a484a0',
    '868f72af-99e8-406e-9f7e-14577e6c7157',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    'ab76b5ca-e464-4063-b949-853f61036370',
    'e858d4f9-3385-4640-b0cb-4894e86d501c',
    '39fe0c8b-bd78-4565-9415-63eabc1d6d85',
    'da4e904f-0346-4cd3-a5c2-ba932511d98d',
    'c46c2220-da88-4f60-a0cf-eebfd0a8ff12',
    'e9a57082-5a93-481a-bbd0-1acb03ac751a',
    '0faf149d-b316-4fbd-8605-a59354f0eacd',
    '1d23d3cc-5db6-4734-bfaa-507dd366d99b',
    'dd58f556-0049-49c3-9a51-d6470a2abddc',
    '00c11c7e-8530-4bfc-abd7-8c10f4c602d3',
    '272dff18-acf3-4874-a55f-ba8fb6f80352',
    '70ac3659-06bb-4022-be55-af81d3e35b6f',
    '409b92cf-5c4d-4997-9736-ef2ea10d19e9',
    'a8636719-e26c-49b6-9a53-7d77f3d3c94b',
    'a3e81d5f-8dd6-43dd-9172-d80d212efa2d',
    'd40af129-c13f-45b2-92f0-d0e8fa5cc1c9',
    'ecd2d2f9-2b6f-4743-8d04-c9bb554a96cb',
    '9ee78822-7acd-4fab-9999-c58e9fe266ad',
]

#############################################
## Execution
#############################################

# Run the PHS/consent review over every dataset ID listed above; the function
# displays its results table and returns nothing.
check_phs_and_consent(dataset_id_list)


# Pull Metadata from Datasets

## Pulling Table Data from Dataset (Generic)

In [2]:
#############################################
## Functions
#############################################

def extract_table_data(dataset_id_list, table_inclusion_list, table_exclusion_list, restore_gs_paths, consolidate_results):
    # Loop through and process listed datasets
    print(f"Start time: {datetime.datetime.now()}")
    df_results = pd.DataFrame()
    for dataset_id in dataset_id_list:

        # Establish API client
        api_client = utils.refresh_tdr_api_client()
        datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

        # Retrieve dataset details
        print(f"Processing dataset_id = '{dataset_id}'...")
        dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "SCHEMA"]).to_dict()
        dataset_name = dataset_details["name"]

        # Record fileref columns if replacing gs paths
        tables_to_process_dict = {}
        dataset_tables = dataset_details["schema"]["tables"]
        for table in dataset_tables:
            table_name = table["name"]
            if table_name in table_exclusion_list:
                continue
            if table_inclusion_list and table_name not in table_inclusion_list:
                continue
            else:
                tables_to_process_dict[table_name] = []
                fileref_col_idx = 0
                for column in table["columns"]:
                    column_dict = {"name": column["name"]}
                    if restore_gs_paths:
                        if column["datatype"] == "fileref":
                            fileref_col_idx += 1
                            column_dict["fileref_idx"] = fileref_col_idx
                    tables_to_process_dict[table_name].append(column_dict)

        # Build query and pull data
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
        for table, col_list in tables_to_process_dict.items():
            print(f"\tProcessing table {table}")
            client = bigquery.Client()
            with_clause = f"""WITH load_cte AS (
                                  SELECT * EXCEPT(RN)
                                  FROM
                                  (
                                    SELECT file_id, source_name AS file_path, ROW_NUMBER() OVER (PARTITION BY file_id ORDER BY load_time DESC) AS RN
                                    FROM `{bq_project}.{bq_schema}.datarepo_load_history`
                                    WHERE state = 'succeeded'
                                  )
                                  WHERE RN = 1
                                ), inv_cte AS (
                                  SELECT file_ref AS file_id, uri AS file_path
                                  FROM `{bq_project}.{bq_schema}.file_inventory`
                                ) """
            if consolidate_results:
                 select_clause = f" SELECT '{dataset_id}' AS dataset_id, '{dataset_name}' AS dataset_name, '{table}' AS table_name"
            else:
                select_clause = None
            from_clause = f""" FROM `{bq_project}.{bq_schema}.{table}` t0"""
            for column in col_list:
                column_name = column.get("name")
                fileref_idx = column.get("fileref_idx")
                if fileref_idx:
                    if select_clause:
                        select_clause += f", COALESCE(t{fileref_idx}a.file_path, t{fileref_idx}b.file_path) AS {column_name}"
                    else:
                        select_clause = f" SELECT COALESCE(t{fileref_idx}a.file_path, t{fileref_idx}b.file_path) AS {column_name}"
                    from_clause += f" LEFT JOIN load_cte t{fileref_idx}a ON t0.{column_name} = t{fileref_idx}a.file_id"
                    from_clause += f" LEFT JOIN inv_cte t{fileref_idx}b ON t0.{column_name} = t{fileref_idx}b.file_id"
                else:
                    if select_clause:
                        select_clause += f", t0.{column_name}"
                    else:
                        select_clause = f" SELECT t0.{column_name}"
            query = with_clause + select_clause + from_clause
            try:
                df_output = client.query(query).result().to_dataframe()
            except Exception as e:
                print(f"Error pulling data for dataset_id = '{dataset_id}', table = '{table}'. Error: {str(e)}")
                continue

            # Write out results if not consolidating
            if consolidate_results:
                df_results = pd.concat([df_results, df_output], ignore_index=True)
            else:
                output_file_path = f"{table}.tsv"
                df_output.to_csv(output_file_path, index=False, sep="\t")
                !gsutil cp $output_file_path $ws_bucket/ingest_pipeline/misc/metadata_extract/$dataset_name/ 2> stdout
                !rm $output_file_path       
        if not consolidate_results:
            print(f"Results copied to: {ws_bucket}/ingest_pipeline/misc/metadata_extract/{dataset_name}/")

    # Write out results if consolidating
    if consolidate_results:
        df_sorted = df_results.sort_values(["dataset_name", "table_name"], ascending=[True, True], ignore_index=True)
        datetime_string = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        output_file_path = f"consolidated_extract_{datetime_string}.tsv"
        df_sorted.to_csv(output_file_path, index=False, sep="\t")
        !gsutil cp $output_file_path $ws_bucket/ingest_pipeline/misc/metadata_extract/ 2> stdout
        !rm $output_file_path
        print(f"Results copied to: {ws_bucket}/ingest_pipeline/misc/metadata_extract/{output_file_path}")
    print(f"End time: {datetime.datetime.now()}")
    

#############################################
## Input Parameters
#############################################

# IDs of the datasets whose tables should be pulled
dataset_id_list = ['4124010f-7308-4831-80d7-ea14343249ab']

# Only these tables are pulled; an empty list means every table is a candidate
table_inclusion_list = [
    #"file_inventory"
]

# Tables that are never pulled (checked before the inclusion list, so they win)
table_exclusion_list = [
    "anvil_file",
    "anvil_donor",
    "anvil_diagnosis",
    "anvil_biosample",
    "anvil_activity",
    "anvil_antibody",
    "anvil_variantcallingactivity",
    "anvil_alignmentactivity",
    "anvil_project",
    "anvil_dataset",
    "anvil_sequencingactivity",
    "anvil_assayactivity",
    "workspace_attributes",
    "file_inventory",
]

# True -> fileref columns are output as the original GS paths instead of TDR file IDs
restore_gs_paths = True

# True -> everything goes into a single file (may not make sense for multiple tables)
consolidate_results = False

#############################################
## Execution
#############################################

extract_table_data(
    dataset_id_list,
    table_inclusion_list,
    table_exclusion_list,
    restore_gs_paths,
    consolidate_results,
)


Start time: 2025-02-05 14:13:06.396991
Processing dataset_id = '4124010f-7308-4831-80d7-ea14343249ab'...
	Processing table sample
	Processing table subject
	Processing table qc_result_sample
Results copied to: gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/misc/metadata_extract/ANVIL_CCDG_WashU_CVD_PAGE_HMB_NPU_WGS_20221025/
End time: 2025-02-05 14:13:36.658566


## Pull Table Schema from Datasets/Snapshots

In [None]:
#############################################
## Imports
#############################################

import data_repo_client
import google.auth
import pandas as pd
import datetime

#############################################
## Functions
#############################################

# Function to refresh TDR API client
def refresh_tdr_api_client():
    """Build a TDR API client authenticated with freshly refreshed default credentials.

    Obtains the application-default Google credentials, refreshes them to get
    a current access token, and returns a ``data_repo_client.ApiClient``
    pointed at https://data.terra.bio with client-side validation disabled.

    Returns:
        data_repo_client.ApiClient: client configured with the refreshed token.
    """
    # Imported locally: this cell only does `import google.auth`, which does not
    # guarantee that the `google.auth.transport.requests` submodule is loaded.
    from google.auth.transport.requests import Request

    creds, _ = google.auth.default()
    creds.refresh(Request())
    config = data_repo_client.Configuration()
    config.host = "https://data.terra.bio"
    config.access_token = creds.token
    api_client = data_repo_client.ApiClient(configuration=config)
    api_client.client_side_validation = False
    return api_client

def extract_table_schenas(object_type, object_id_list, output_path):
   
    if object_type in ["dataset", "snapshot"]:
        print(f"Start time: {datetime.datetime.now()}")
        schema_results = []
         # Loop through and process listed objects
        for object_id in object_id_list:

            # Establish API client
            api_client = refresh_tdr_api_client()
            datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
            snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)

            # Retrieve dataset details
            print(f"Processing {object_type} = '{object_id}'...")
            try:
                if object_type == "dataset":
                    object_details = datasets_api.retrieve_dataset(id=object_id, include=["SCHEMA"]).to_dict()
                    object_name = object_details["name"]
                    object_schema = object_details["schema"]["tables"]
                else:
                    object_details = snapshots_api.retrieve_snapshot(id=object_id).to_dict()
                    object_name = object_details["name"]
                    object_schema = object_details["tables"]
            except Exception as e:
                print(f"Error retrieving object from TDR: {str(e)}")
                print("Continuing to next object.")
                continue
            
            # Parse and record schema details
            for table in object_schema:
                table_name = table["name"]
                if filter_out_fss_tables and "anvil_" in table_name:
                    continue
                else:
                    for column in table["columns"]:
                        column_name = column["name"]
                        schema_results.append([object_type, object_id, object_name, table_name, column_name])
            
        # Format and write out results
        df_results = pd.DataFrame(schema_results, columns=["object_type", "object_id", "object_name", "table_name", "column_name"])
        df_sorted = df_results.sort_values(["object_name", "table_name", "column_name"], ascending=[True, True, True], ignore_index=True)
        results_file = "schema_extract.tsv"
        df_sorted.to_csv(results_file, index=False, sep="\t")
        !gsutil cp $results_file $output_path 2> stdout
        !rm $results_file
        print(f"Results copied to: {output_path}")
        print(f"End time: {datetime.datetime.now()}")    
                 
    else:
        print("Invalid object_type provided. Please specified 'dataset' or 'snapshot' and try again.")
    

#############################################
## Input Parameters
#############################################

# Object type (valid values are 'dataset' or 'snapshot')
object_type = "dataset"

# List objects to extract the schema from
object_id_list = [
    'b12fb9be-2ce0-4bfd-8503-732fabba06ab',
    'd48adc59-8934-41bb-9720-63e71f1933be',
    'e2a398ff-18c3-4258-9d75-89adb2923e88',
    '8e88cabc-e713-44ed-a5d2-41935c3b4eb5',
    'be8cfc23-cd19-46fb-92e1-a77ac380d7aa',
    'f9224ea2-dd31-421d-80d4-f35082ef8d68',
    'e293ce2d-af17-4fb0-a84b-47078830c898',
    'db266afc-2f75-4b03-a3b8-c69e0ce6f713',
    '2cbe079d-e7ab-47d8-836e-454a71440297',
    'dd6866e4-8949-45bd-8910-8ce64f79e3c7',
    '12bbfa4c-c30a-4cf6-b79f-45354f842964',
    '84ac0d05-4be5-43e9-973e-ef999144d802',
    'd306000b-88c1-4220-8d7e-933c0118a983',
    '703c4bc2-81bf-435a-87fa-21dc9278bad6',
    'ff8b1212-858a-4048-8f63-9464c922591a',
    'c814d754-cdc5-4b0d-8671-a39e85b2c473',
    '797b2563-5d56-4f5c-bdaf-3bfd11e8f5b3',
    '85287d84-fefe-40df-ad40-5b135ee0c07f',
    '7eeede5a-c86f-4577-9f3c-65ab618a6dee',
    'ab76b5ca-e464-4063-b949-853f61036370',
    'a52c04ee-cfef-46bb-9b40-6a9b292e1a7b',
    '0194eea9-d779-4957-8521-11717a378e66',
    '0d82658c-44b3-4cea-a388-3353a96a31ef',
    '77dca0d5-4d22-4415-8858-075590d25cb5',
    '84fad495-2756-472f-ad20-f91de6f67baf',
    '1b05159b-6277-4345-9d59-f7bba5ea1d56',
    '92299ff4-c0d0-4e94-b374-75d0038cbd68',
    '60f96582-79ad-4461-9f9a-53c1bc3d17b6',
    '732eaae3-b509-4a7a-8961-09d861e55253',
    'cc107de7-d623-464a-a875-c8b7ae5fb09d',
    'b5d7c34a-c383-4fc7-aa4d-b6dc941cd41a',
    '85baa8f8-619c-4165-9d3e-53220f645814',
    '3f278de3-f201-4344-9639-d35cd7a62adb',
    'c423b18d-12f2-43e4-97f9-993e2943270e',
    '416b8daa-9537-46db-ae7b-3f5ff5f01dc3',
    '61940344-e6c1-484e-ba10-131f43a9b13a',
    'ceee2791-0fdf-45fc-a4e8-8077916771aa',
    '5205f817-7de0-48b4-89fc-6398cf13bff7',
    '1d2f5472-ab6a-4a9b-ba53-520858cf79db',
    'f757278a-3c74-4690-bf89-5149d21ff3af',
    '5a103ab3-29c3-4d07-a0f6-4999c256cf26',
    '672b617f-936e-440a-a735-80f94798aed1',
    '28e73469-12d4-493b-bf6f-83359c1f69c5',
    '2a263db0-8c33-4171-840f-54bf4755a4b9',
    '68ea655f-b4a3-43e2-95e4-f158ca2d67dd',
    '9f9fc99a-b867-49a9-a3dc-8a39efbd5fa2',
    'ce58654d-b7d3-466b-99ba-b203d527a543',
    '179eb85e-2557-4677-9cba-d763310f3df9',
    'cba804c9-0bdd-4219-a53e-98c8db6334a0',
    'd239dd7b-8d10-4960-aa91-8f8ede641e25',
    '19e2c8ab-853a-4204-86c3-f591125fbf63',
    '7cf0d3d0-f79b-4bfe-bfc8-e4e6c33dd4c3',
    '3a72e4b8-afb4-4299-98ec-a9ba9606be06',
    'f3c89298-0dd2-40da-8627-3baea553b34a',
    '9a32e23e-840d-4ba3-8cd9-392f48b8e9d2',
    'c5d967fd-09ce-4b02-97dd-ac3abf6f79fa',
    '5069fc2c-b957-4130-adca-6eabae943867',
    '173e56f7-b813-4c41-89ff-09a824e1407f',
    '5c6a1c4f-ccd3-48a8-ac00-e18e5ecaa0bb',
    '80312f74-bd56-4938-96ba-e9bed95d1f3b',
    '017445d7-d56e-4e2e-b480-b4879b51e944',
    'cb8ebcd0-bb5e-4a6d-bfef-5c651a1a9f6e',
    '9828f3fe-f676-4bf1-b600-5effa24ea9c8',
    '6238f8f7-5efb-4023-8d85-ef7db9b4dad7',
    '32c09444-3d4a-44d5-af6b-07eef92189db',
    '13b2076a-cfe1-49ec-ac61-bad1af9a52ea',
    '516ceb43-1378-4c02-88fc-a1d2a2258d59',
    'a36eeaf7-d6dd-4887-bdbd-e435a07ba156',
    'c9986260-0c1b-4fd3-8132-6fa7353046e6',
    'bda2bec8-a142-47ab-bfb4-83759ac2bddd',
    'e858d4f9-3385-4640-b0cb-4894e86d501c',
    '39fe0c8b-bd78-4565-9415-63eabc1d6d85',
    '28849dc9-a97f-469b-b2ac-a8ff97693f02',
    'da4e904f-0346-4cd3-a5c2-ba932511d98d',
    'c46c2220-da88-4f60-a0cf-eebfd0a8ff12',
    '29cd0578-fb47-495a-8f48-b37325eed81a',
    '629e31cb-dd7b-4345-abf2-fa23c6c65a09',
    'e9a57082-5a93-481a-bbd0-1acb03ac751a',
    'ba503d2e-48af-48bb-910a-be41790d921c',
    '0faf149d-b316-4fbd-8605-a59354f0eacd',
    '472f01ad-7bc3-4fe5-9771-2695930dbc95',
    '96461004-f4b3-4f82-a842-293b3ec46a60',
    '956cd931-7077-4a08-9c75-ab8b4e5d1eb8',
    '1d23d3cc-5db6-4734-bfaa-507dd366d99b',
    'dd58f556-0049-49c3-9a51-d6470a2abddc',
    '841b7883-9447-4ea0-ae4a-84ea0240d919',
    '31a42df0-29f6-4d4e-ae5c-2e13abc355f2',
    '13364604-ed08-4a61-89cd-65eb372ac8c3',
    '53185d06-f2cc-4942-88c4-8534b559a9ff',
    'b724164c-712c-4615-97b7-529a108a753a',
    'd0fc3d6a-c3f4-4533-8a23-817a4e27f9be',
    '22199347-9454-41e5-8912-eb38edd33a25',
    '0132f320-830d-40d0-a4da-06a5d5f9e8d9',
    '60cadee3-9e63-4897-ac81-4fb283033648',
    'bb7d6408-941a-4da6-8613-36498bc6d91b',
    'dcd4112f-09d3-43ed-8441-df9bf4c9ddc8',
    'b8d11ca1-3db8-4efa-bf57-0305e004a26d',
    '9bd56ad6-080e-4d26-acca-83e4df8aa913',
    'da02c3f1-371b-4afe-9b5e-b8c584fd5907',
    '00c11c7e-8530-4bfc-abd7-8c10f4c602d3',
    'f6565f2f-4478-45ad-8c11-04dd242fc6a9',
    '275ea204-4612-4d3c-ac0d-f110f61d62ad',
    '8f6b9e20-9468-4f46-aa45-eeab9de88e53',
    '72f73fc5-6a3a-43a0-8cce-09f4726b736c',
    '272dff18-acf3-4874-a55f-ba8fb6f80352',
    '2cace5dc-f660-45d4-b689-c4c89e77697c',
    '70ac3659-06bb-4022-be55-af81d3e35b6f',
    '409b92cf-5c4d-4997-9736-ef2ea10d19e9',
    'be72f1e3-b5f5-43f4-80db-6d7de93a654e',
    'a8636719-e26c-49b6-9a53-7d77f3d3c94b',
    'b6bf4699-6f61-4c6a-9d42-ad055a0de008',
    'a3e81d5f-8dd6-43dd-9172-d80d212efa2d',
    'd2272f2d-c606-4027-b8ea-0bdd6d9d6535',
    '8abf299c-cd4e-4ce0-b5cf-4f9abe8cc891',
    'd40af129-c13f-45b2-92f0-d0e8fa5cc1c9',
    '0b6eb077-2eca-4fe6-b012-26fab725b907',
    'ecd2d2f9-2b6f-4743-8d04-c9bb554a96cb',
    '5c659e81-e687-4710-a4fd-000ca593155d',
    '9ee78822-7acd-4fab-9999-c58e9fe266ad',
    '42965913-4223-484a-9b3d-abc0002d277d',
    '175dd803-02c7-4823-81d5-9e0621652ace',
    '15492baa-05ed-47bc-b50c-e587679ae51a',
    '33705ce9-b2b3-4edc-9b47-f54283e193cf',
    '92486440-3a46-44dd-b853-b300ef75b31e',
    '72e639d1-b8c5-45fd-9acd-a8e5e2b7fa0d',
    'd6823ccd-7247-4efc-8841-f53f456351ed',
    '1048a860-d5ff-4f61-95e5-851e1266d4c1',
    'd7686f98-05a4-45c9-af2e-3ebc524a5b2d',
    'dd2cb8fc-42a6-482f-898e-ef6125feccb8',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '8ccefc59-38a5-476f-b7d3-3f98315a97f0',
    '4e99b8e1-40b9-4fb2-90a0-d85e926ef31e',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '6e67e1e1-5c39-43da-960f-48385789c4e1',
    '128332b6-5060-4ec4-b6a6-f53b54a810be',
    '92382848-f5e9-426c-b7dc-f2841ae97018',
    '4999a410-990e-484b-b4f3-d636f894a741',
    '1f534eb4-701f-4182-9895-64c5e5b52d82',
    '51e9935f-ec18-4832-801a-6d9186537572',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    'feca4815-b44b-4b2b-8d77-75edd62ba5a6',
    '41cb9f29-4ba6-4690-821c-cb085e6b0f2f',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'e68d1d39-99df-4cd7-8053-1b298f03eabb',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '7427b2eb-a84f-413c-bfb0-7d2e36b0628f',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '28c3df75-0b08-4d5c-9feb-6e2e918572ea',
    '7ea006d9-1e19-4678-b2e6-d4a1ea327f74',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    'f69c21e9-cb5f-4e72-acfe-c54b672a9f3b',
    '34fd3b22-ac73-47d2-8849-5877158ec072',
    '2ef4530a-cc36-4f32-9a1a-63a555346587',
    '577f36fe-8154-4c82-ac87-b2a64cb68f35',
    '10774229-1487-4188-b2c3-1fabcf85492a',
    '7ce3270e-b2f2-47f4-a288-639751b2f87f',
    '65793118-3c88-4185-9172-2354850e6056',
    '36bdd59f-4f5b-43cd-8d34-a21ef87bbf30',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    '46536136-08e4-4521-8e6c-67f023de020d',
    '41d12dc1-8718-4439-b409-26cc23573107',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    'b8e7fe18-9c3d-4cc0-bbc7-85b27197fc8f',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    'b60b4737-c646-4299-85a0-520890e830b7',
    'a9ad3a05-24fb-4e59-85b0-ee09e55a4492',
    '280c5d6f-39a3-4d1d-aad2-a174451cd9b2',
    '3725b660-1106-4173-9c4b-0a15926becf5',
    'ecd0e3b1-a177-4487-8e33-0084688cf148',
    '2d434f2c-6aaa-46b2-ada9-de4b887e13d3',
    'bad1fb5c-d263-48d7-8e4c-fa873a17d707',
    'b2b217c2-4b68-4820-bf9d-e2927bfe8706',
    '75fb0984-2124-444f-881b-30a1a6f8b8f7',
    'bafbf771-1cd2-44fc-9b38-5a4bbead8ab2',
    '15be288e-53e1-41cb-8d20-8ea87efb9258',
    '700303c2-fcef-48a5-9900-096bf34e2d83',
    '8b8185d3-ba5c-4832-af23-3ff8ca6ed016',
    '140797da-dc94-4fc2-8b0b-f2e1dec7bd43',
    '8de6dae2-55ff-4287-9b75-5b2a950c1f44',
    'e4ccd185-2b0c-445d-9c57-0dc45c8f9d7e',
    'ae34e63e-13af-48b8-8b72-8137289091b3',
    '56f9888f-e623-4a1a-b2b4-46378a6cd6fe',
    '3fd2204c-8654-4af7-832f-c186447262e0',
    '5488d7c1-5195-4ebc-b0f0-31033fa06dc9',
    'd48db47e-acba-4377-b031-f6dfc21f3658',
    'ec97fa0f-e174-40fe-a6b8-ee240bdf4318',
    '20ddfcd5-d456-431b-9f05-781e05d873d6',
    '575dc7da-58ed-407d-9e88-7b586f28bf65',
    'd3ed2595-b8be-40c8-b7b6-10a4997b9d2e',
    'b2e7f15b-65d5-4812-abfd-b2dbc6d18850',
    '15d41c35-943c-474b-afa6-e1c6d6e4be2b',
    '61803dc8-f649-43e5-ab15-d351f2cef629',
    'abe58d43-e1c7-4953-aa41-4d3b6f6cca44',
    '3ef7966a-ec1e-4dba-9d31-cdb33692e78f',
    'fa278604-7d85-4491-a30d-15c7821f8b00',
    'dd6c6688-b73a-464c-86d9-3369fdf98268',
    'b5c0bf91-9d20-41a2-9dd2-87d0ef0310f9',
    '69f8d7c2-2e14-48e6-b838-7881016313fb',
    '97c636f9-0983-481f-8ff9-7b5b3ee6b10e',
    '15b153f5-ed02-4216-8f96-99743b8b4fc3',
    '747858c0-d139-4f52-9f0e-a618b880d6d6',
    '32bc49c6-7583-4613-a72f-5edb12b2a808',
    '3eb8ea77-4605-4bb7-90f9-671953abe4a2',
    '809fa952-3178-46b7-bb82-8a476ef32e67',
    '2b08cb76-061d-44c6-a00f-b43a5421df5e',
    'dfb14a1f-38b2-4668-b98f-59b5b5b53ca3',
    'ab7e390a-adc5-4f9e-b317-a216a2904c93',
    'f553b765-1c9c-464c-a8fa-07700a1691c5',
    'd1e6d0e4-d49e-4a16-93c6-7956b2c03414',
    'b108dfd0-711d-4bc1-aab5-1b312226c8ad',
    '713f8676-8034-4827-bccc-cd6d95b1a4c4',
    '38eca26c-d79e-4447-99d3-1889d20ade21',
    'b00883d8-9251-435d-aefc-8a703d96d2fb',
    'd5a0e24d-689a-4854-92c7-9a39f980b523',
    'eefbea02-0d65-441e-b455-35aa21d25ba3',
    'ecd0606f-4fa7-4e57-b6e8-eea377e65d5b',
    'ff7e3be2-c0ac-4d97-85da-6229bf7585ac',
    '0c6bc810-5ae0-4926-a56b-2bc2fe7dbe6d',
    '7ac92a42-e112-49c3-a8f5-8ad2c7ef5578',
    '9dc31133-c882-4f39-903a-a25f316bb560',
    '0701aae2-8661-4eec-84e0-7c8be1c89a18',
    '23a0ede0-4f97-46af-9f04-bd2805050980',
    '7593c1c2-3680-4bf5-8a65-dce5f96a3b59',
    '7efb1905-34b4-4f1c-a8a6-8e64b3640a68',
    '11a2b088-8c1c-47d2-9c1e-455d457d2f05',
    '02661394-2886-4ef7-aff1-d53225c82025',
    '74608bd9-39e4-4f48-9b7c-1cd9d3c599c9',
    'c1644d4e-06e2-4fa8-95f1-5c1da5831257',
    '7baf8e8c-de11-452d-b2e1-aad7c08cc18e',
    '267cf516-dd33-4640-a71a-78bd8f5db9d8',
    '23b0219d-0820-4017-b942-bda8578e90e2',
    'd7bcfc5d-e258-4bd6-a413-bb7a118e6bff',
    'a5f631ea-2b4b-43f2-9ea0-e31f2b11fa27',
    'df06ff22-6a2d-4934-aac9-c8368efbea1a',
    '8523489a-f57c-4993-81e4-1ed86a5c092d',
    'dcdefb14-f6de-4c46-ac7e-842b273416bf',
    '395da421-e6e8-4a26-ac93-eb7050a7cb1f',
    '28208cc6-50bf-4864-9a48-981632066640',
    '9cb5ce25-38e8-4628-9ddf-d6aedf5efe0f',
    'ccfe264d-a35e-44f8-9b2b-241a0f8327cc',
    '615f6246-1c39-4e44-a9d4-c7133a2ae62d',
    'f177843d-47fb-46ae-83be-73c92ee85081',
    'a647528d-925e-4c02-8825-ff54720c6ee4',
    'd049d487-1a69-4358-8dad-0e6fa6c06fdd',
    '2c6f63b2-439e-499f-b687-b3fdd88a492e',
    'da29226b-e856-4014-8c8b-c4268d0df2cc',
    '68a916af-2e0c-41bd-8535-c7eacbc2d1b7',
    '714dcdbd-8d17-40b0-8246-0e941af8175d',
    '0e7f31a0-c712-4ebf-ab3a-64c37f43e52a',
    '53ce7d12-facc-4412-a710-f535efb209a2',
    'c8b1d323-f352-482e-bf17-82075c23dcee',
    'b9842819-5fd6-40c5-9668-aae1ea44a308',
    'd30f51c7-d642-4e7d-a168-967b9520a80a',
    '7c056125-3ed8-459c-b73e-edfa3f80cc27',
    '8d89608c-0d61-4d71-a2e3-9fbc6cda69bf',
    'ed1215f1-787c-40f5-9d77-4b5bc2dfbb84',
    '71219f56-551f-4ad4-9a38-cc4aaf8a1e9a',
    '7feeb2b1-1926-4968-b6dc-e0a1e4cf8d4a',
    '48dd6010-77dc-465b-a27c-695e29b57a5e',
    '3376a8b6-7ef6-4191-97ab-a547da0d330d',
    '21384132-1697-4e9b-b863-a6492d13285d',
    'dbf5d87b-4cab-44cd-a792-1d0218aad973',
    '582f5f8d-b96f-490e-b417-ba824baeb06c',
    '85dbde76-c130-40b2-8a8a-ba815ba499da',
    '7e825ee6-7c03-43cc-b0a4-0d9203a30bd9',
    '1c2fe11d-b020-4c54-8c71-1ea91623d626',
    '2843292e-e494-4642-90e0-57e5c153f12c',
    '84133066-68cd-41fa-819b-d74a3ac85862',
    '4ecbb7c8-0246-47f8-9654-4caca1d52565',
    '54c6fa73-9b84-4a3b-9e97-e4e43165c48b',
    '7e3ea1bd-95ba-4cad-90c8-3eec95be9cc8',
    '90ba1853-f845-4502-ba36-b75b9e571bc5',
    'bbba696b-d023-4bb1-a213-c8bee31e8bae',
    'f3c88c3c-8e1b-4af9-9467-0621404e314c',
    '00bd45f9-beb2-4fb0-8680-bd30e392975a',
    'f85e467a-958f-4da5-a01b-8df883e69122',
    '470eee0f-2053-4d9b-9f5e-ca9661a6cc16',
    '332bb145-6ef1-40ef-932c-aec5bb6210d9',
    '5c1dc76d-b703-445c-9b38-cc2d00b9ab16',
    '0b25d09e-b2d9-4452-9810-1d0ef777f9d6',
    '608d793e-a78b-4872-a50c-21a9eaa60ec3',
    '44f83f20-d618-40b5-b2cb-3676b8fe3ad7',
    '74ede771-6781-4980-bfb9-5d853b7cdd6f',
    '3c2c39a9-4cc2-4f7c-89e0-054a871e2c4e',
    '6c47e282-5d5e-445c-b6bd-c0024946fbe0',
    '6ac178b7-a923-407f-8cd8-1733e1b2ebd5',
    'ff8ffbcf-c932-48c7-8d5e-d995d5680e21',
    '2ebb722f-a3df-4ea4-b72a-813e3db0bab5',
    '9d74b4f0-b2d4-46aa-867a-52fb6102bfdf',
    '1817528a-4f88-4ed6-8965-9eae0220ab27',
    'e34f15f7-c225-4314-a638-90504bb0aa0d',
    'e03eb011-05f9-4491-b779-0cc2aefabff1',
    'c1d222ab-bc0e-4e13-8379-0ee5be9e140e',
    'd4bb7169-5a7d-4090-ba62-12ea799c3ade',
    '582187a5-ad63-4759-9162-55fa6337eb07',
    '3f172982-060d-4339-a09b-6994c2c9eb16',
    'd1e8d19a-970d-4ede-b5bc-9cab7237adec',
    '51789659-5233-4ee7-8bca-dedebfc87773',
    'af867604-d801-41cc-9949-017eb30a0cbf',
    'c9dd3578-01db-4687-9807-4f71368941d1',
    '722e332c-fb1a-45fe-80c7-cc670f025b7f',
    'a5fe75bb-d28c-42fb-aaf8-92fa37b266d2',
    '9f152896-ebf1-4756-b678-bdf739a92256',
    '6545d602-e5b4-4dd1-8f6a-64e0a1952ddc',
    '478aa270-fbd4-4a45-8f63-221b4066168e',
    '0b06619d-39d9-4437-8c42-2e415faa634c',
    '12ffb586-5f6a-4f0a-a353-d2f34599f4cc',
    'e9c7ad29-2213-4648-9164-33a07bd42cdb',
    'e5c79b74-20d5-4b6f-8085-0bc788eed2ea',
    'f492567d-6db8-45c8-b44e-6b5def26c812',
    '1d140c76-a06b-42a0-bae8-b9e169ebe394',
    '5edcc3db-c676-412a-9506-600959bb81f2',
    '9f7dbe05-96b5-4b2f-9f3a-34b552e3dd21',
    'ccc524ab-d9ad-467c-a25b-9a14fb05e976',
    '9e3fb02d-dcf6-486f-a42d-89446a852057',
    '4b341ba9-49a5-43a2-9b7e-cc96beb59946',
    '15ae6390-6f6d-4fd8-9a51-ecf988676c4d',
    '3a3100bb-369e-47c1-a77c-2cacb7cf020d',
    '12e54f96-00d9-4a38-921e-e0d42610b2c1',
    '845b131e-7c05-4397-ad40-23dea8e9b399',
    '2c11b505-17c8-402e-8422-0239accb449d',
    'bd492b71-b20e-4056-b8ae-ad8c94cfbc02',
    'e25a8172-1e34-442c-a45d-583027a2d734',
    '0c18589c-6432-4a6c-90ce-985a47a66f39',
    'a6d7e030-e6c8-4c62-8cb5-165ef54987c4',
    '9ecc231f-e3d3-4417-a98a-c4db4c638161',
    'c911503c-f010-4c17-ac57-1d82e954bdc7',
    '3fb2d04a-d18b-4bdc-9372-99b992f2ae42',
    'e922a496-e686-4fa1-911d-2159ceb0f09f',
    '677f0bdf-6c5c-462b-8294-3666f777bbc5',
    '8fbfea50-6a71-4b19-98e9-f95e3a8594c7',
    '34da5c11-bbe8-4e55-8d89-9ef8a1c66200',
    '9a4d9d5f-72aa-4d7d-90f1-6d1181ee984c',
    'd911e57a-ebb8-4be8-876b-d8e5790ddce3',
    '6c9423a2-3ea7-4c3c-9b12-0cc993bc095f',
    '52e015b5-22b7-4a96-9f0a-ea3afccbfcbc',
    '325f3ee8-2adb-4092-bd78-1b5ea5b0d1d6',
    'e6b15b39-daba-431f-a918-e4e43e702c30',
    'ae50ef98-ef3d-4427-b094-83b2d90787a0',
    'e6771964-50e9-482f-9d23-18c22cd89ab8',
    '3fbacc64-4c53-4770-8cdf-a616c10ec5c7',
    'a3ae33bb-8b3a-47e5-a2d1-a49c954776b3',
    '0b0a52bb-a1a2-4638-9259-4447761c2da4',
    '0eb42259-7b44-450f-a9d7-500b2ea7179c',
    'e16adabb-88e0-4739-983a-98ac5c181842',
    '71f94dff-fbe8-4881-af1f-4987b67d5181',
    'bef62e8a-5f5c-4e81-a8f8-ddeaf657b4e8',
    '5627cdbb-22a0-436f-a7a4-34d7ce21bb45',
    '0e65b131-fd14-4fce-908b-c5b89a71a9c1',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
    'd00353de-f6f9-42d9-8a8f-f88b3d880dbf',
    '3be57453-9325-4c2e-b73a-832139b61778',
    'fcc60ac9-0d20-4a7c-97e4-e3c8d3aa8f76',
    'dc5f85d8-333b-4b68-b160-ad9856233887',
    '655e6a61-5400-4d8a-95bc-1506e026b289',
    '64fd39fc-b32e-4b0a-8f83-4bf11b197462',
    'a77a2c65-38fe-4bf7-9ea6-0a2dc65eb21f',
    '25248cd8-2e98-4a83-9ccf-af7214fa71d6',
    '158ebecd-4596-4541-b832-a137232b7036',
    '1ccb95c3-1901-428e-b7bb-34495f41f4d2',
    '02ff1051-cd1d-4bbb-a005-21384cbff846',
    '0144b0d3-a809-46df-8c67-7ce42bdd579a',
    '35a1009d-93a2-49b1-a801-fe84d6b7a2f5',
    '50132478-c9fb-4dc5-86cd-d5dfab909393',
    '35064fc1-6c52-4005-8e99-cb0d6afd3f8c',
    '5cf859f6-990c-4b04-8609-35d5c57920f0',
    '62cfdce6-2d4d-415c-a11e-5ab60131c668',
    '2d07dd45-a263-440d-a339-9ccbab93aba8',
    'f1513955-0264-4733-bd25-3f752c61a323',
    '93e712f2-3e54-466e-aa53-57eb69c43bc0',
    '296f653a-91a8-4139-9bab-e6ae13afe99c',
    '633dc1aa-084d-43bd-9b17-bc6e57f81d48',
    '9320b3b5-3944-4bd2-913a-23b72bccd86c',
    '86ab4d3b-86ce-422b-ae6f-1ec6968a874d',
    '1f2d14d4-1bd8-46fc-9d35-1a415e5f326a',
    '4124010f-7308-4831-80d7-ea14343249ab',
    '3037caeb-fa7a-4924-b399-7e4c7173b3b8',
    '146b72bc-1dcc-4e3b-bcda-d3dd25418012',
    '1d575e14-c3b1-4ead-a63b-a21c08c6a14d',
    '128dce74-fa37-4f2f-8a80-d542edd81a11',
    '841970b7-bed0-4a75-a28a-a4cc59740a84',
    'a5f53fc8-8f9b-4e9a-af63-6f8c54d478b2',
    'f461fca1-80b2-4980-83a8-e165d49acc18',
    '37f0f1f9-83fb-49a1-9941-093c068c32d0',
    'cfb3dad7-c6d9-47c0-81b0-2133d75f5c0d',
    '0447c960-bbfe-4e42-a95b-dd3d1d9a368e',
    'c5c0893f-b254-4038-8d08-b28ef5a26b5d',
    'bbcf8529-1a04-43fc-b6cf-cb161028159d',
    '06421648-dfcb-4460-b93b-c7d6804dddbb',
    'e0b28b59-1cb5-44f4-ab8f-badf5c74f69f',
    'cc19d19e-6f7e-41b8-87a6-77f41d53e650',
    '631deea0-2821-4d14-ad02-dc0ce4864924',
    '95788aa7-c897-4ae8-9166-4b8fc1fc5342',
    'eede320a-ed63-41d8-960d-5405a26a194f',
    '36dccf81-6932-43ae-9864-53379832d878',
    '9102024d-58c0-4bb9-aa55-12c00d98b6cd',
    '483d3454-54da-4243-bbeb-98cbf1d088d0',
    '01eaf423-8cab-491a-b82e-6915dbc73594',
    '0481a135-9db1-424f-9065-a83ebd7ec995',
    'b60876c5-d825-4303-befb-ffff55b92aba',
    '49022563-1be1-4e42-a11c-01743cd5c94d',
    '64f2dbe1-6f58-493d-ab6b-c93568d828f4',
    'a9626803-72c2-4e23-968c-a090e3f22c5e',
    '095728d6-4ea1-4909-8a74-a8f3fa7f86cb',
    '8309cd89-a912-462a-90ad-f13ae0d7aa6c',
    '902596ce-714e-49b3-8271-f3dfece52309',
    'f5f29e4b-68f7-443c-b290-0827d4167fd5',
    '07c3a7f4-1e59-4dcb-a244-2fd3d084e2b0',
    '24470eb6-97c2-4cd4-b484-87a7d634c5b3',
    '544f643d-b19f-4aa0-a6ec-a90e1a8681d6',
    '5137255f-0c58-4ac7-9266-bda8ab0247c2',
    '6f49717f-8f57-42d0-8548-316ecc292415',
    '7e693091-8ae4-4c40-8e66-c3b39f01b90e',
    '6765ce2d-ebc8-4367-8855-c0f8e62cb355',
    '5243df74-712d-49a8-989b-528d15088e8f',
    '278a26cb-a710-4fff-928e-fc2e7084a75a',
    '58a1d168-8290-4c69-bf01-17ba3a084365',
    '9fc492f3-8d13-47ae-93e9-812c0224f1aa',
    '822d381e-cea0-45bb-8fa0-1b7194b4b64b',
    '2dc01a50-ea7b-4d9b-be57-1ffbdd98b27b',
    '5b6676dc-f46e-43a8-b87c-e431e369e53c',
    'eb35085f-0cbf-4829-a3ad-acaa53a250b5',
    '3a781e70-cf6e-41c2-8d68-2326f16986e7',
    '1b4a324a-5621-4399-85d2-f91aa03418b5',
    '36fa2d20-622d-4cca-80b0-683672c94170',
    '44161b51-953d-4f6b-9448-5cba4a44a9d8',
    '0b90b2ea-8ca3-406a-9f69-95eddf7699ef',
    'bfb202bc-4078-4df0-82b9-9218dbc1f1a1',
    '16031a34-f1ba-4bde-af43-1822f1516944',
    '8cc59f51-b0df-4a5d-a3c5-83ee526ff1af',
    'fda7c4b9-9f35-482b-9eff-be7f11058d94',
    '85646f4a-e424-4363-8033-1e7522e8f175',
    '048afc84-cdd2-4b39-8ea5-7351f4699761',
    '8945794e-174a-49f9-a2d4-4242f9bf3833',
    '529343b4-698a-4b36-ac55-db8a6965ad3f',
    '09642596-d33a-4261-8bf7-eb1dbb37d572',
    '75119ed5-b8aa-4f45-bdef-e3c673bbe44c',
    'cb1f06fa-b916-477d-8ab6-fb4b3f24efd3',
    '04a874df-c57b-40fc-9139-bc3a05129115',
    '2b8ad26a-e66e-4b03-a65a-5b504cecacfd',
    '2f5efaac-9409-4135-8da0-c742024c0653',
    '564df99e-b79d-4217-b608-5b5a6769de4e',
    '0f949ee9-0986-42b2-af5d-0f4c8338c664',
    'a7226f10-bdba-4284-97b3-0738a5912770',
    'aa314675-af62-41df-b5cb-3b22558e903b',
    '93b2ac60-2208-4ef8-a1c2-68a623e45807',
    'a963c15d-9c97-49e4-af95-cdee96333a76',
    '20741062-7d1d-44b7-bc33-39c9ad26e414',
    '69ce1be3-1815-43a4-bdd2-4696d9c8d09a',
    'fb5d9952-ebe7-4ee6-ba00-819ed00f3593',
    '76dd508c-aa80-4e54-9ac4-23b5e0545316',
    '1c6bef41-3cfa-46b2-b183-0a523e417457',
    '18716daf-4223-44a9-bba9-fc9baeef7d07',
    '475430c5-28cb-456d-9c5c-bdbfab9fafb2',
    'f0061cb3-688e-4ad4-aeb8-8614282292ec',
    '6905d8d1-da77-4f7c-86e5-3af7db2b00b4',
    '3a89c170-2939-4c12-9940-f32d96fa9e55',
    'a3ea4f97-6657-4d3c-9be6-96f097f5c952',
    '9a06c401-da3f-41b4-b38b-238796fcae09',
    'b32d88c8-31e3-4789-a75f-e52bf1272937',
    '9f4ac69c-0919-4ac1-98a8-976ed79ace03',
    '2a81cd6f-aa6e-436b-b4ba-68d5f713fb07',
    'efcdb584-7659-4780-9d6d-e6599fb0033c',
    '5e0e8f9a-ce97-4b18-9540-3015c61e393c',
    '1c8ba244-1c7f-433a-825b-d2d34d018dcf',
    '373ff2e8-0f63-4179-a55c-3fe0b85556aa',
    '032d39fb-d278-427d-b7d2-de648a25a20c',
    '352a503b-41eb-4a84-b257-68d70e55337e',
    '737d39b8-2f99-4eac-bcda-a03996e08939',
    '7577f264-8e84-440d-9346-7c4d5affda51',
    'febd8561-4769-4f3b-b7c0-ae7ff6ede2e9',
    'b8c5b185-8669-43d1-8ec7-c0f6d223d505',
    '31e61d00-61cc-46f2-a793-8ea8dfbb0832',
    'cb7dccc5-171c-48bf-9e5e-07bd6f52b34a',
    '279e5670-8a47-4992-bb10-14e6c719db97',
    'd596ee91-481c-4eb5-9a8a-88c1e10ba9b6',
    '64594d5d-0429-4d89-bdf6-2f92dcd19d80',
    'cefc1a79-446c-40d2-b140-ba8d8b1c0712',
    '2355554e-8951-4b41-bcd8-32e18cddb7c9',
    'de1e7762-673f-4d44-8f45-7e693bb338b7',
    '166746e8-ce26-4fa1-a587-443ca9fc59a1',
    'f747ce3a-2f50-4db8-8fdb-5cde4f266110',
    'af6c6f09-f0d2-46fe-bda0-c6fa5901c4a3',
    '3fdcdafd-5328-418b-85f8-47b0006de468',
    'c6f3bd64-ea67-488f-904f-f0bdf6320b5c',
    'ec6f49a2-176c-4564-82c5-e751baab46aa',
    'ac48514d-0b01-4a92-b164-821fa3e05d7a',
    '6d18aafc-0240-499c-902e-a72a5b98ff0a',
    '8681cdcf-b775-4b56-aace-3f3e448261ef',
    'f0db3b27-c952-477d-bc33-9b96a250e168',
    '9e1a6a7d-b45e-4fd2-a1ff-df131da4c713',
    '8da05494-fe7a-4af5-b257-bada143ee426',
    '8b098ab4-df02-4619-8ded-657e496695c1',
    '263ab7c9-bd69-45dd-abb7-bbf35b9786ed',
    'e0c7877e-75d7-47d8-b5e9-5dd677d03353',
    '49a97523-0a7a-4d5a-ae20-496f86de2032',
    '583023a1-aa12-40e2-a964-8ad50ad400ba',
    'c56f0a76-2b91-4860-8dff-63c9504bb0e2',
    '17d3ffb4-e891-4ac6-a91a-fd52971c1115',
    '5203f051-7e84-4969-b4ce-eda56a859793',
    '27acea14-41c9-4bf9-ad43-3ebb3ce90456',
    '868f72af-99e8-406e-9f7e-14577e6c7157',
    '488a38ee-f996-482d-a562-a4474f5594de',
    '4e699ead-bbb5-460d-9b32-2b1b322c601b',
    '680d748c-7c60-46e2-aea5-7fc557a916ea',
    '462d992a-7c13-45ac-a6da-1254fc3a9031',
    'dbb4df81-9115-45d1-b51d-875e0669edc4',
    '6fd0f009-3c34-4529-9a38-c59745545490',
    '74d1e549-5ae8-4410-9428-f8f2cc85fa80',
    '73f7d2b4-86ec-4f7e-a1f9-37c7b023e3bf',
    'ca391820-c11f-46c6-a3ed-e484511f5545',
    'f51b119c-50b3-4954-b489-dffd0c0fc121',
    '254ffffc-2bd6-4b2e-905b-a8c54c348cd0',
    '42152265-490d-4d74-8c94-5b6cde81da5b',
    '3a9604d7-456a-453d-a46b-40408624a07e',
    '595b6755-e7ae-4e83-af2e-693c089aeec3',
    '4d01e12e-503e-4447-8e49-8c2b77ffb00d',
    'd5bdf6a0-f9b4-4e25-a1f3-634374af0727',
    '8c0c984c-71fc-4d8b-8401-0751561fa958',
    'c2f88d90-8463-4ed8-9df6-94659aaf8e7a',
    '9444154f-dd26-4cfe-94d8-0cc5174c6089',
]

# Specify the output GCS path for the results file
output_path = "gs://fc-2a9eefc3-0302-427f-9ac3-82f078741c03/ingest_pipeline/misc/metadata_extract/schema_extract_20250204.tsv"

# Specify whether FSS tables ("anvil_%") should be filtered out of the results
# NOTE(review): this flag is not passed in the call below; presumably the function reads it as a
# notebook global -- confirm, otherwise the setting has no effect.
filter_out_fss_tables = True

#############################################
## Execution
#############################################

# Run the extract for the object type / object IDs configured above, writing results to output_path.
# NOTE(review): the function name is spelled "schenas" here -- confirm it matches the name used in
# the function definition (not visible from this section), otherwise this line raises a NameError.
extract_table_schenas(object_type, object_id_list, output_path)


## Pulling Sample Metadata for CCDG

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Datasets
dataset_id_list = [
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
    '9ee2a552-89f8-4a48-9c94-9fa26ebb7483',
    '425412ba-894a-4824-acb8-bf18fe4576e0',
    'f22bd762-5c45-453e-bf22-b174514abb84',
    '0ee62643-b064-42f8-9b09-5d10eacd70a3',
    '1a7f6728-5116-4f24-897a-59a7f322cfd2',
    'c37b388c-7107-43d6-bee6-4e82b40ed271',
    'bf6f1d78-6a0d-4afb-aea6-17a3c34340db',
    'a3becdde-018b-46f0-adea-d587076eef4a',
    'a9ad3a05-24fb-4e59-85b0-ee09e55a4492',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
]

# Establish API client
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")

df_results = pd.DataFrame(columns = ["dataset_id", "source_workspace", "sample_id", "chip_well_barcode", "collaborator_participant_id", "collaborator_sample_id"])
for dataset_id in dataset_id_list:
    
    # Retrieve dataset details and pull source workspace(s)
    print(f"Processing dataset_id = '{dataset_id}'...")
    dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except:
        source_workspace = ""
    
    # Pull sample data
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    client = bigquery.Client()
    query = f"""SELECT *, '{dataset_id}' AS dataset_id, '{source_workspace}' AS source_workspace,  FROM `{bq_project}.{bq_schema}.sample`"""
    try:
        df_output = client.query(query).result().to_dataframe()
        present_col_list = [col for col in list(df_output.columns.values) if col in ["dataset_id", "source_workspace", "sample_id", "chip_well_barcode", "collaborator_participant_id", "collaborator_sample_id"]]
        df_results = pd.concat([df_results, df_output[present_col_list]], ignore_index=True)
    except Exception as e:
        print(f"Error pulling data for dataset_id = '{dataset_id}'. Error: {str(e)}")
                                                                           
# Sort dataframe records and write out to file
df_sorted = df_results.sort_values(["source_workspace", "dataset_id", "sample_id"], ascending=[True, True, True], ignore_index=True)
output_file_path = "ccdg_cvd_sample_metadata.tsv"
df_sorted.to_csv(output_file_path, index=False, sep="\t")
!gsutil cp $output_file_path $ws_bucket/ingest_pipeline/resources/misc/ 2> stdout
!rm $output_file_path
print(f"End time: {datetime.datetime.now()}")
print(f"Results copied to: {ws_bucket}/ingest_pipeline/resources/misc/{output_file_path}")

## Pulling Subject Metadata for CCDG Afib

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Define table of interest
table_name = "participant"

# List dataset to pull data frome
dataset_id_list = [
    'dd2cb8fc-42a6-482f-898e-ef6125feccb8',
    '51e9935f-ec18-4832-801a-6d9186537572',
    'e68d1d39-99df-4cd7-8053-1b298f03eabb',
    'a08dc7a6-f8ce-4205-95d2-83f614c2c32f',
    '36bdd59f-4f5b-43cd-8d34-a21ef87bbf30',
    'c4c49fcd-0c20-4cff-841a-cb58f5689c5b',
    'd6518df9-fc11-46ed-9c12-b9782d3829a0',
    '9ee2a552-89f8-4a48-9c94-9fa26ebb7483',
    '425412ba-894a-4824-acb8-bf18fe4576e0',
    'f22bd762-5c45-453e-bf22-b174514abb84',
    '0ee62643-b064-42f8-9b09-5d10eacd70a3',
    '1a7f6728-5116-4f24-897a-59a7f322cfd2',
    'c37b388c-7107-43d6-bee6-4e82b40ed271',
    'bf6f1d78-6a0d-4afb-aea6-17a3c34340db',
    'a3becdde-018b-46f0-adea-d587076eef4a',
    'a9ad3a05-24fb-4e59-85b0-ee09e55a4492',
    '719f7581-21db-4aec-8c46-4a5811832710',
    '318a75f4-ac50-4944-81b0-70a1323e7497',
    '75fb0984-2124-444f-881b-30a1a6f8b8f7',
    '15be288e-53e1-41cb-8d20-8ea87efb9258',
    '700303c2-fcef-48a5-9900-096bf34e2d83',
    '38fd20ce-affd-4791-9810-7f5a7fe876d0',
    'd56ae233-d6d2-483c-917e-1de0fe1cfeb7',
    'd7686f98-05a4-45c9-af2e-3ebc524a5b2d',
    '1939b7ae-fc6b-42a8-ad5f-dc51a1682a17',
    '8ccefc59-38a5-476f-b7d3-3f98315a97f0',
    '2cda53ba-b852-47e8-8f24-59ab8e9f1d1f',
    '6e67e1e1-5c39-43da-960f-48385789c4e1',
    '92382848-f5e9-426c-b7dc-f2841ae97018',
    '4999a410-990e-484b-b4f3-d636f894a741',
    '1f534eb4-701f-4182-9895-64c5e5b52d82',
    'd01a4268-1bfe-4a2d-a2d4-e296162c406e',
    'feca4815-b44b-4b2b-8d77-75edd62ba5a6',
    '039dd3d6-0cb5-4cd1-86b3-e9579c9b5218',
    'd0ce8b95-9c3b-4f9e-8ce0-169fd89a8b20',
    '7427b2eb-a84f-413c-bfb0-7d2e36b0628f',
    '9d796a02-e2aa-4c15-b8d6-1e90cd736681',
    '28c3df75-0b08-4d5c-9feb-6e2e918572ea',
    '433e3a09-661a-46a5-96f2-dbb07bdc87f3',
    'f69c21e9-cb5f-4e72-acfe-c54b672a9f3b',
    '2ef4530a-cc36-4f32-9a1a-63a555346587',
    'e917c83d-c482-442d-81ce-869de7d20903',
    '10774229-1487-4188-b2c3-1fabcf85492a',
    '65793118-3c88-4185-9172-2354850e6056',
    'fcb03f4f-e685-4803-aadb-0e8940ff4f37',
    '46536136-08e4-4521-8e6c-67f023de020d',
    'c2f0e7cf-ac07-48f7-b5f1-497ee6c134b2',
    'b8e7fe18-9c3d-4cc0-bbc7-85b27197fc8f',
    '3abfc362-7e73-4663-9dcf-07b78b9aa2d4',
    'b60b4737-c646-4299-85a0-520890e830b7',
    '128332b6-5060-4ec4-b6a6-f53b54a810be',
    '06f05f58-3c83-4f5c-bddd-bed7d2d1d147',
    'd6291444-8c3c-470c-b28c-7cf1d5c7aad8',
    '41d12dc1-8718-4439-b409-26cc23573107',
    '6b40557c-ddc3-4e7e-8a45-1761e7fcb8b5',
    '577f36fe-8154-4c82-ac87-b2a64cb68f35',
    '7ea006d9-1e19-4678-b2e6-d4a1ea327f74',
    '7ce3270e-b2f2-47f4-a288-639751b2f87f',
    '41cb9f29-4ba6-4690-821c-cb085e6b0f2f',
    '34fd3b22-ac73-47d2-8849-5877158ec072',
    '4e99b8e1-40b9-4fb2-90a0-d85e926ef31e',
    'b252e3ac-4a8c-48e0-9999-5ee0c9a5842d',
]

# Establish API client
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")

df_results = pd.DataFrame()
for dataset_id in dataset_id_list:
    
    # Retrieve dataset details and pull source workspace(s)
    print(f"Processing dataset_id = '{dataset_id}'...")
    dataset_details = datasets_api.retrieve_dataset(id=dataset_id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except:
        source_workspace = ""
    
    # Pull data
    bq_project = dataset_details["access_information"]["big_query"]["project_id"]
    bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
    client = bigquery.Client()
    query = f"""SELECT '{dataset_id}' AS dataset_id, '{source_workspace}' AS source_workspace, * FROM `{bq_project}.{bq_schema}.{table_name}`"""
    try:
        df_output = client.query(query).result().to_dataframe()
        df_results = pd.concat([df_results, df_output], ignore_index=True)
    except Exception as e:
        print(f"Error pulling data for dataset_id = '{dataset_id}'. Error: {str(e)}")
        no_tab_recs = {"dataset_id": [dataset_id], "source_workspace": [source_workspace], f"{table_name}_id": ["TABLE NOT FOUND"]}
        df_no_tab_recs = pd.DataFrame(data=no_tab_recs)
        df_results = pd.concat([df_results, df_no_tab_recs], ignore_index=True)
                                                                           
# Sort dataframe records and write out to file
df_sorted = df_results.sort_values(["source_workspace", "dataset_id", f"{table_name}_id"], ascending=[True, True, True], ignore_index=True)
output_file_path = f"ccdg_afib_{table_name}_metadata_v2.tsv"
df_sorted.to_csv(output_file_path, index=False, sep="\t")
!gsutil cp $output_file_path $ws_bucket/ingest_pipeline/resources/misc/ 2> stdout
!rm $output_file_path
print(f"End time: {datetime.datetime.now()}")
print(f"Results copied to: {ws_bucket}/ingest_pipeline/resources/misc/{output_file_path}")