In [None]:
#!pip install --upgrade data_repo_client

In [1]:
# Imports
import import_ipynb
import ingest_pipeline_utilities as utils
import data_repo_client
from google.cloud import bigquery
from google.cloud import storage
import google.auth
import google.auth.transport.requests
import pandas as pd
import datetime
import os
import re
import time
import requests
# Workspace context read from the environment (a missing variable raises KeyError)
ws_name, ws_project, ws_bucket = (
    os.environ[env_var]
    for env_var in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# Bucket name without the leading gs:// scheme
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

# Display options
for display_option, option_value in (
    ("display.max_rows", None),
    ("display.max_columns", None),
    ("display.max_colwidth", None),
    ("display.width", 1000),
    ("display.colheader_justify", "center"),
    ("display.precision", 3),
):
    pd.set_option(display_option, option_value)


importing Jupyter notebook from ingest_pipeline_utilities.ipynb
Version 1.0.25: 3/10/2023 8:46am - Nate Calvanese - Turned on the predictable file IDs dataset creation parameter
importing Jupyter notebook from source_files_creation.ipynb
Version 1.0.9: 2/25/2023 3:15pm - Nate Calvanese - Replaced FAPI with utils functions
importing Jupyter notebook from build_file_inventory.ipynb
Version 2.0.0: 3/7/2022 9:32pm - Nate Calvanese - Massive performance improvement with use of gsutil parsing
importing Jupyter notebook from process_table_data.ipynb
Version: 1.0.9: 3/8/2023 12:09pm - Nate Calvanese - Performance improvements for file ref lookups
importing Jupyter notebook from build_mapping_query.ipynb
Version 1.0.12: 2/21/2023 2:50pm - Nate Calvanese - Added support for $BQ_DATASET substitution variable
importing Jupyter notebook from output_data_validation.ipynb
Version 2.0.6: 2/28/2023 11:33am -- Updated notebook to be usable in dev (removed TDR host hardcoding)
importing Jupyter notebook 

# AnVIL Resource Access Control

## Bulk Add Users to Workspaces (and associated Auth Domains)

In [2]:
# Users to update and the access level each should receive. Every [email, role] pair is applied
# to every workspace in workspace_list, and the user is also added to the members group of each
# authorization domain attached to those workspaces.
user_role_list = [
    #["user_email", "role - READER, WRITER, OWNER, NO ACCESS"]
     ["ncalvane@broadinstitute.org", "WRITER"]
]
workspace_list = [
'AnVIL_ccdg_asc_ndd_daly_talkowski_AGRE_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_brusco_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_chung_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_goethe_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_kolevzon_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_lattig_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_mcpartland_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_menashe_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_parellada_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_passos-bueno_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_persico_asd_exome',
'AnVIL_ccdg_asc_ndd_daly_talkowski_renieri_asd_exome',
'anvil_ccdg_broad_ai_ibd_alm_gmc_wes',
'anvil_ccdg_broad_ai_ibd_daly_chung_gider_wes',
'anvil_ccdg_broad_ai_ibd_daly_lewis_ccfa_wes',
'anvil_ccdg_broad_ai_ibd_daly_mccauley_wes',
'anvil_ccdg_broad_ai_ibd_daly_newberry_share_wes',
'anvil_ccdg_broad_ai_ibd_daly_niddk_cho_wes',
'anvil_ccdg_broad_ai_ibd_daly_xavier_prism_wes',
'AnVIL_CCDG_Broad_AI_IBD_McCauley_WGS',
'anvil_ccdg_broad_ai_ibd_niddk_daly_brant_wes',
'anvil_ccdg_broad_ai_ibd_niddk_daly_duerr_wes',
'anvil_ccdg_broad_ai_ibd_niddk_daly_silverberg_wes',
'AnVIL_CCDG_Broad_CVD_AF_BioVU_HMB_GSO_WES',
'AnVIL_CCDG_Broad_CVD_AF_GAPP_DS-MDS_WES',
'AnVIL_CCDG_Broad_CVD_AF_Swiss_Cases_DS-MDS_WES',
'AnVIL_CCDG_Broad_CVD_Stroke_BRAVE_WGS',
'AnVIL_CCDG_Broad_MI_BRAVE_GRU_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EP_BA_CN_ID_MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_AUSAUS_EPIL_BA_MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_AUSRMB_DS-EAED-MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_BELULB_DS-EP-NPU_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_CANUTN_DS-EP_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_CYPCYP_HMB_NPU_MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_DEUPUM_HMB_MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKB_HMB_NPU_MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_DEUUKL_HMB_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_DEUUTB_HMB-NPU-MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_FINKPH_EPIL_CO_MORBIDI_MDS_WES',
'anvil_ccdg_broad_np_epilepsy_fralyu_hmb_wes',
'AnVIL_CCDG_Broad_NP_Epilepsy_GBRSWU_CARDI_NEURO_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_HKGHKK_HMB_MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_HKOSB_GRU_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_IRLRCI_GRU_IRB_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_ITAICB_HMB_NPU_MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_ITAIGI_GRU_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_ITAUBG_DS_EPI_NPU_MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_ITAUMC_DS_NEURO_MDS_WES',
'anvil_ccdg_broad_np_epilepsy_lebabm_ds_epilepsy_wes',
'AnVIL_CCDG_Broad_NP_Epilepsy_LEBABM_GRU_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_NZLUTO_EPIL_BC_ID_MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_USABCH_EPI_MUL_CON_MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_USACCF_HMB-MDS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_USAHEP_GRU_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_GSRS_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_NPU_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_USAMON_GRU_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_Marsh_GRU_NPU_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_Marsh_GRU_WES',
'AnVIL_CCDG_Broad_NP_Epilepsy_USAUPN_Rader_GRU_WES',
'anvil_ccdg_nygc_np_autism_cag_ds_wgs',
'AnVIL_CCDG_NYGC_NP_Autism_GASD_GRU_WGS',
'AnVIL_CCDG_NYGC_NP_Autism_HFA_DS_WGS',
'AnVIL_CCDG_NYGC_NP_Autism_PELPHREY_ACE_DS_WGS',
'AnVIL_CCDG_NYGC_NP_Autism_PELPHREY_ACE_GRU_WGS',
'anvil_ccdg_nygc_np_autism_searchlight_ds_wgs',
'AnVIL_CCDG_NYGC_NP_Autism_SPARK_GRU_WGS',
'AnVIL_CMG_Broad_Muscle_OGrady_WES',
'AnVIL_NIMH_Broad_ConvergentNeuro_McCarroll_Eggan_CIRM_GRU_WGS',
]

# For each [user, role] pair: grant the role on every workspace, then add the user to the
# members group of each authorization domain on that workspace. Problems are collected per
# workspace and shown in one results table per user, so one bad workspace never aborts the batch.
request_timeout = 60  # seconds; a stalled call is recorded as a failure instead of hanging the run
for user_role in user_role_list:
    user = user_role[0]
    role = user_role[1]
    print(f"Processing ACL updates for user: {user}")
    results = []
    for workspace in workspace_list:

        # Initialize
        print(f"\tProcessing ACL updates for {workspace}.")
        error_list = []

        # Establish credentials (refreshed for every workspace)
        creds, project = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)
        auth_headers = {"Authorization": f"Bearer {creds.token}"}
        workspace_url = f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{workspace}"

        try:
            # Grant the requested access level on the workspace
            payload = [{
                "email": user,
                "accessLevel": role,
                "canShare": True,
                "canCompute": True
            }]
            response = requests.patch(
                url=f"{workspace_url}/acl",
                headers=auth_headers,
                json=payload,
                timeout=request_timeout
            )
            if response.status_code != 200:
                error_list.append(f"Error adding to workspace ACL: HTTP {response.status_code}")

            # Pull workspace attributes
            ws_response = requests.get(
                url=f"{workspace_url}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
                headers=auth_headers,
                timeout=request_timeout
            )
            if ws_response.status_code == 200:
                ws_attributes = ws_response.json()
                auth_domains = ws_attributes.get("workspace", {}).get("authorizationDomain", [])
            else:
                # Without the workspace record the auth domains are unknown; record it and move on
                error_list.append(f"Error retrieving workspace details: HTTP {ws_response.status_code}; auth domains not updated")
                auth_domains = []

            # Add user to auth domains
            for ad in auth_domains:
                auth_domain = ad["membersGroupName"]
                response = requests.put(
                    url=f"https://api.firecloud.org/api/groups/{auth_domain}/member/{user}",
                    headers=auth_headers,
                    timeout=request_timeout
                )
                if response.status_code != 204:
                    error_list.append(f"Error adding to auth domain ({auth_domain}): HTTP {response.status_code}")
        except (requests.exceptions.RequestException, ValueError) as e:
            # Connection failure, timeout, or a response body that is not valid JSON
            error_list.append(f"Request failed: {e}")

        # Record status
        status = "Success" if not error_list else "Failure"
        error_str = "; ".join(error_list)
        results.append([workspace, status, error_str])

    # Display results
    print(f"\nResults for user: {user}")
    results_df = pd.DataFrame(results, columns = ["workspace", "status", "errors"])
    display(results_df)
    


Processing ACL updates for user: ncalvane@broadinstitute.org
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_AGRE_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_brusco_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_chung_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_goethe_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_kolevzon_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_lattig_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_mcpartland_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_menashe_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_parellada_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_passos-bueno_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_ndd_daly_talkowski_persico_asd_exome.
	Processing ACL updates for AnVIL_ccdg_asc_nd

Unnamed: 0,workspace,status,errors
0,AnVIL_ccdg_asc_ndd_daly_talkowski_AGRE_asd_exome,Success,
1,AnVIL_ccdg_asc_ndd_daly_talkowski_brusco_asd_exome,Success,
2,AnVIL_ccdg_asc_ndd_daly_talkowski_chung_asd_exome,Success,
3,AnVIL_ccdg_asc_ndd_daly_talkowski_goethe_asd_exome,Success,
4,AnVIL_ccdg_asc_ndd_daly_talkowski_kolevzon_asd_exome,Success,
5,AnVIL_ccdg_asc_ndd_daly_talkowski_lattig_asd_exome,Failure,Error adding to auth domain (AUTH_AnVIL_ccdg_asc_ndd_daly_talkowski_lattig_asd_exome)
6,AnVIL_ccdg_asc_ndd_daly_talkowski_mcpartland_asd_exome,Failure,Error adding to auth domain (AUTH_AnVIL_ccdg_asc_ndd_daly_talkowski_mcpartland_asd_exome)
7,AnVIL_ccdg_asc_ndd_daly_talkowski_menashe_asd_exome,Success,
8,AnVIL_ccdg_asc_ndd_daly_talkowski_parellada_asd_exome,Success,
9,AnVIL_ccdg_asc_ndd_daly_talkowski_passos-bueno_asd_exome,Success,


# TDR Reader Management

## Remove Undesired Readers from TDR Snapshots

In [None]:
# Function to remove erroneous readers from snapshot
def clean_up_ad_readers(snapshot_id, readers, max_attempts=1):
    """Remove every member of a snapshot's "reader" policy that is not an allowed reader.

    Args:
        snapshot_id: ID of the TDR snapshot whose reader policy is cleaned up.
        readers: Emails allowed to remain on the reader policy. If it contains the
            placeholder '$AUTH_DOMAIN', every auth domain listed in the snapshot's source
            dataset properties is also allowed, as "<auth domain>@firecloud.org".
            The caller's list is never modified.
        max_attempts: How many times each removal is attempted before it is reported
            as failed (default 1).

    Returns:
        None. Progress, the number of removals, and the remaining readers are printed.
    """
    print("Cleaning up readers for {}...".format(snapshot_id))
    # Work on a copy: appending auth domains to the caller's list would leak one snapshot's
    # auth domains into the allow-list used for every snapshot processed afterwards.
    reader_list = list(readers)
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    # Retrieve snapshot, grab auth_domain
    if '$AUTH_DOMAIN' in reader_list:
        snapshot_response = snapshots_api.retrieve_snapshot(id=snapshot_id)
        snapshot_name = snapshot_response.name
        print("Snapshot name: {}".format(snapshot_name))
        try:
            auth_domain_list = snapshot_response.source[0].dataset_properties["auth_domains"] or []
        except (AttributeError, IndexError, KeyError, TypeError):
            # No source, no dataset properties, or no auth_domains entry: nothing extra to allow
            auth_domain_list = []
        for ad in auth_domain_list:
            reader_list.append(ad + "@firecloud.org")

    # Retrieve snapshot policies and delete readers that aren't in reader list
    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id)
    delete_count = 0
    for policy in snapshot_policy_response.policies:
        if policy.name != "reader":
            continue
        for policymember in policy.members:
            if policymember in reader_list:
                continue
            # Fresh client before each removal, as in the original flow
            api_client = utils.refresh_tdr_api_client()
            snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
            last_error = None
            for _ in range(max_attempts):
                try:
                    snapshots_api.delete_snapshot_policy_member(id=snapshot_id, policy_name="reader", member_email=policymember)
                    delete_count += 1
                    last_error = None
                    break
                except Exception as e:
                    last_error = e
            if last_error is not None:
                # Best effort: keep going, but make the failure visible instead of swallowing it
                print(f"\tFailed to remove reader {policymember}: {last_error}")

    # Print results
    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id)
    print(f"\t{delete_count} erroneous readers deleted.")

    for role in snapshot_policy_response.policies:
        if role.name == "reader":
            rem_readers = ", ".join(role.members)
            print(f"\tRemaining readers: {rem_readers}")
    return

# Clean-up snapshots: only the addresses in reader_list are kept as readers.
# Uncomment '$AUTH_DOMAIN' to also keep each snapshot's auth domain groups.
reader_list = [
    "azul-anvil-prod@firecloud.org",
    # '$AUTH_DOMAIN',
]
snapshot_id_list = ['b0fc6253-d274-4e53-9977-85d943116f7c']
for snapshot_id in snapshot_id_list:
    clean_up_ad_readers(snapshot_id, reader_list)


## Add Auth Domain Users to TDR Snapshots

In [None]:
# Function to add readers to snapshot --> TO BE DONE WHEN ADs NEED TO BE ADDED BACK


# Snapshot Row Count Collection

In [None]:
def return_row_counts(snapshot_id, results_list):
    """Append the total row count across all tables of a snapshot to ``results_list``.

    The snapshot's table names and BigQuery location are read from TDR, then a single
    query counts ``datarepo_row_id`` values over the union of every table.

    Args:
        snapshot_id: ID of the snapshot to count rows for.
        results_list: List that receives one ``[snapshot_id, row_count]`` entry.
            It is modified in place and also returned.

    Returns:
        ``results_list`` with the new entry appended. The count is 0 when the snapshot
        has no tables, or when the snapshot lookup or the query fails (the failure
        reason is printed in that case).
    """
    # Grab access information from schema
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    try:
        response = snapshots_api.retrieve_snapshot(id=snapshot_id, include=["TABLES", "ACCESS_INFORMATION"]).to_dict()
        # De-duplicate table names; sorted so the generated query is deterministic
        table_names = sorted({table_entry["name"] for table_entry in response["tables"]})
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as e:
        print(f"Error retrieving snapshot {snapshot_id}: {e}")
        results_list.append([snapshot_id, 0])
        return results_list

    # No tables means no rows; skip BigQuery rather than submit an empty (invalid) subquery
    if not table_names:
        results_list.append([snapshot_id, 0])
        return results_list

    # Build row count query
    row_count_subquery = " UNION ALL ".join(
        "SELECT datarepo_row_id FROM `{project}.{dataset}.{table}`".format(project=bq_project, dataset=bq_dataset, table=table_name)
        for table_name in table_names
    )
    row_count_query = "SELECT COUNT(*) AS row_count FROM ({subquery})".format(subquery=row_count_subquery)

    # Execute query and write results to results list
    try:
        client = bigquery.Client()
        df_results = client.query(row_count_query).result().to_dataframe()
        row_count = df_results["row_count"].values[0]
    except Exception as e:
        print(f"Error counting rows for snapshot {snapshot_id}: {e}")
        row_count = 0
    results_list.append([snapshot_id, row_count])
    return results_list
    
# Snapshots to count rows for
snapshot_id_list = ['bb7eaad8-b02c-455c-964d-c9242019d9e5']

# Collect one [snapshot_id, row_count] entry per snapshot
results_list = []
for snapshot_id in snapshot_id_list:
    results_list = return_row_counts(snapshot_id, results_list)

# Convert results to dataframe and display
results_df = pd.DataFrame(
    data=results_list,
    columns=["snapshot_id", "row_count"],
)
display(results_df)


# Pulling Dataset Sizes Across AnVIL

## Pulling file counts and sizes from TDR

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
dataset_limit = 2000  # maximum number of datasets requested from the enumerate call

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# A single BigQuery client serves every per-dataset query below
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=dataset_limit)
if len(datasets_list.items) >= dataset_limit:
    # Hitting the limit means the listing may be cut short; make that visible
    print(f"Warning: enumerate_datasets returned {dataset_limit} datasets (the requested limit); results may be incomplete.")
for dataset_entry in datasets_list.items:
    # Only datasets on the configured billing profile are reported
    if dataset_entry.default_profile_id != billing_profile:
        continue

    source_workspace = ""
    try:
        # Retrieve dataset details and pull source workspace(s)
        dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        try:
            source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
        except (KeyError, TypeError):
            # Properties missing, empty, or without a usable source_workspaces list
            source_workspace = ""

        # Pull data file size sum from BigQuery
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
        file_size_query = """SELECT COUNT(*) AS file_count, COALESCE(SUM(size_in_bytes),0) AS file_size FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
        df_output = client.query(file_size_query).result().to_dataframe()
        file_count = df_output["file_count"].values[0]
        byte_size = df_output["file_size"].values[0]
        status = "Success"
    except Exception as e:
        # Best effort per dataset: record the failure and keep scanning the rest
        print(f"\tError retrieving file stats for dataset {dataset_entry.name} ({dataset_entry.id}): {e}")
        file_count = 0
        byte_size = 0
        status = "Error"

    # Build record for dataset
    record = [dataset_entry.id, dataset_entry.name, source_workspace, file_count, byte_size, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Source Workspaces", "File Count", "File Size (Bytes)", "Retrieval Status"])
# Render byte counts as plain integer strings in the displayed table
df["File Size (Bytes)"] = df["File Size (Bytes)"].astype(int).astype(str)
df_sorted = df.sort_values(["Source Workspaces"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

## Pulling file counts and sizes from WS Buckets

In [None]:
# List of buckets:
bucket_list = [
'real_mock_data',
'fc-secure-f7f06a5e-1c5f-4d2e-b195-dd668d9d4b55',
'fc-secure-f6d7a7f7-c215-417b-a6d3-59eaec5af8bc',
'fc-secure-f5d884c0-a24c-46e6-8c29-cad7f5b158c7',
'fc-secure-be0617df-a95e-468c-a83d-37b1c4fae931',
'fc-secure-5a7daa8b-9c5d-4b8d-a312-85e6c8af0197',
'fc-secure-ced72bfc-ed3b-4ffe-838c-0d4ccc7abfd1',
'fc-secure-8976991f-e383-43db-86b5-3d1b15af0a51',
'fc-secure-6cb5e032-ae16-464a-859b-56384e40f1a3',
'fc-secure-585ef3f0-76a9-40ae-8ed3-a47190c3148f',
'fc-secure-a6c8533c-8111-4986-8621-0b6c633deff6',
'fc-secure-b94a3f83-a0ad-4850-a05a-8736eb893577',
'fc-secure-ffc72fe8-aa14-4a33-81a8-6bd0f269f834',
'fc-secure-68d06583-108f-44d3-bf2c-42fbb1952d45',
'fc-secure-1d07990d-3332-4233-ad22-18d701ae7ead',
'fc-secure-2ecd823d-d20c-453d-a798-7fe21c195be7',
'fc-secure-af8f60c0-6264-407d-ac9d-2da74d7f8a57',
'fc-secure-a4bc968d-87fa-46d8-8716-76f70e5c6415',
'fc-secure-0ac23157-c61b-47c0-9567-a5296e124a84',
'fc-secure-d55d1c0c-b1e3-42e9-b452-20a81be1f84a',
'fc-secure-4984306f-6ceb-48a0-87d5-a6c4ef499867',
'fc-secure-b4316f22-8920-4b86-a8cd-2a4a7b356f0d',
'fc-secure-afbf7a91-d826-4227-893f-f4a0dbee5cfa',
'fc-secure-386d1d06-75c7-4683-b1ca-62e0d7ac3700',
'fc-secure-4c6f587f-be8c-4bb8-9053-7a85ed68ed3e',
'fc-secure-1e391fda-00d2-4dc9-94f2-845e59d57117',
'fc-secure-ed4bfb0f-b490-4523-bb2d-af58735df6ab',
'fc-secure-1fb85b31-9a1e-46ef-a206-41040d151f94',
'fc-secure-97f7b2db-e05f-4ac4-af27-6156169ca00a',
'fc-secure-2085a664-58a9-404c-ae57-210fb19806fb',
'fc-secure-2c4bf327-7583-4aee-b2c2-4575abdea4ca',
'fc-secure-20d7c83f-35a7-4b7e-9a1b-c40bfdff29b2',
'fc-secure-10b393b7-8607-4999-b1c5-18b9aabb49d6',
'fc-secure-ad9305f6-4693-4b75-a64a-e9317b40abac',
'fc-secure-d51e93aa-adc1-4628-a6f2-e0dd1f0900c8',
'fc-secure-cbf1f8fb-8185-46c9-9034-63073cbe7be7',
'fc-secure-09e06b8a-30e8-47a8-8faa-ec7201bc120d',
'fc-secure-1b2f65a3-61f8-4ce1-93ad-65b690c1f043',
'fc-secure-e14b3e8e-156e-4264-b8f2-43664206ca0c',
'fc-secure-86684396-82a0-4f1b-a918-b90b039b922c',
'fc-secure-2f25349b-e646-426d-8aa7-3a7e779d70fa',
'fc-secure-588b9330-ec2f-4652-86cd-f6f57b4aecbf',
'fc-secure-21c6905e-06c8-45f2-b6ed-ffba467f7f75',
'fc-secure-c327926b-a5aa-4b40-93b8-e092fa116e88',
'fc-secure-285cdab6-0292-4c49-b22f-5232df22e9a7',
'fc-secure-b6e37dad-6d9c-49b3-ba96-042a7de9f47c',
'fc-secure-f26bf965-6554-4da5-ac6d-d04ea7d604b7',
'fc-secure-f20281f0-d586-432c-b790-b44eb5098d01',
'fc-secure-9472fed0-5f8d-4a1c-a778-9ae44d90462d',
'fc-secure-d24a56f9-7a81-48b7-b1be-1d0e0ae0ac06',
'fc-secure-203efee5-de46-4cb1-9374-3c5be56e5a5e',
'fc-secure-c7ffba40-1140-4c5e-8336-8032b7ec5f9e',
'fc-secure-4c1fdb5d-53b0-4a2f-88bd-da63817348d7',
'fc-secure-7248d8b7-e064-487c-9514-28a7cf0e271c',
'fc-secure-4a5440e0-1b04-42c6-a1b4-e8249a578727',
'fc-secure-ad3b9ebd-9a6e-490a-81a0-bda548231ed8',
'fc-secure-ec3a8ac1-4929-4338-8a2f-487c6fa6a218',
'fc-secure-22693da1-7a53-4115-9b93-98856c7c1ca8',
'fc-secure-a8d262fb-151c-4c3d-ac09-7957a168fd5e',
'fc-secure-709943c7-62cd-4ba8-9e19-77cdae337f5d',
'fc-secure-d4a884d6-418a-4c64-a96e-bdb0ac2ff248',
'fc-secure-c3d739f2-7c47-4d5f-b273-1f426677cfde',
'fc-secure-f61fa64e-4d50-4a54-af57-97bf4f92cdb3',
'fc-secure-f012c96e-3e1b-4a9a-800b-bde7ed486f98',
'fc-secure-0b19bbb9-481f-404d-a4a6-3024b7c4b698',
'fc-secure-c5377331-3721-4e3b-b8d0-03715d9b428a',
'fc-secure-3824ddf8-5512-4dee-bdd5-b17ddc111ca7',
'fc-secure-c53f7f30-6128-4fbc-8ae4-2559a8dbc752',
'fc-secure-3aa171b5-8208-4467-8743-c7879f3da6da',
'fc-secure-07fcf1c8-7442-4047-826b-8fe723325ba2',
'fc-secure-8e933d1a-d546-4267-967f-bc00d03f4880',
'fc-secure-ec1e8941-0ccb-41db-bc39-1ddf558a06c2',
'fc-secure-e66447ae-c1b8-4a5b-a3e6-2bed1ca59d8f',
'fc-secure-e40316e1-165b-44ae-82cb-7136c37aacdb',
'fc-secure-a941f3d7-5afc-4886-a4ab-c9d6a6459b1f',
'fc-secure-215c5ec0-940a-4808-9353-667f3a3af3c3',
'fc-secure-19ae92b3-86a1-4ec5-a8dc-f9d1ea226c4c',
'fc-secure-2011b97c-a9c9-4a13-8911-f3833be31253',
'fc-secure-25f8efc0-42fc-48d5-9a39-53c131c4d6ef',
'fc-secure-87fff435-fd01-4bd9-91b4-8670f7b19162',
'fc-secure-1a841909-2e43-4c83-8b66-ba15aceecfee',
'fc-secure-0aabe38b-bfbd-40ee-83fa-23e70329be00',
'fc-secure-65eb240a-fda1-4c9b-90ff-2476751bc967',
'fc-secure-2513a28a-aff0-422a-b650-9c05fa6800c3',
'fc-secure-bae8180f-5cb1-4f57-b480-79ba9cfdd90e',
'fc-secure-3003014b-05be-4ee1-8aa5-cc10e9a51293',
'fc-secure-7c502c02-435a-4cc6-a9bc-b2406d8079be',
'fc-secure-d6b2ec0c-a216-4bc7-b1b3-be527db39d9a',
'fc-secure-0c8958a9-8b70-421e-81a5-a30a3a332e55',
'fc-secure-0e71f746-0db3-4ff7-ba65-4a454d8ff0e1',
'fc-secure-286b80a7-cd88-4feb-bc4b-b497c13b7fa2',
'fc-secure-d9fd78a9-5fd1-41e7-85b4-10c32f5149e1',
'fc-secure-1114cae3-387f-4277-8a1f-0b912487b224',
'fc-secure-756b92cc-ad23-41f2-8566-3620b1016886',
'fc-secure-c96f6079-de25-4367-ae8b-1676f1898309',
'fc-secure-2e284171-99ef-45b5-9261-b99a9208b241',
'fc-secure-b39f3519-3601-46b6-b7bc-dedddbfd7306',
'fc-secure-6513d7e1-2dbb-41a2-baea-3f7fdbcbb620',
'fc-d3e9eb24-cb19-47d8-b2c6-d85fd34b4ff1',
'fc-0ed1ef2d-1039-4c8a-a0a9-91c3e385200a',
'fc-282a8e0b-df88-42de-9059-2b7447d9f9c7',
'fc-secure-5efb4966-0994-41f8-a911-1d159c9bae1b',
'fc-2836a560-113a-4239-acab-5cce58019b73',
'fc-bb71bb7a-fdb1-427a-9e56-eb08b6fd7955',
'fc-secure-e9b2e26a-3f73-4f5a-862f-c5b3be68703f',
'fc-e7051891-25c8-4776-80ed-26b1af860277',
'fc-4f070061-0bc2-4f9a-9fe9-869a739c9817',
'fc-c1701683-c10e-4f73-a636-f774e8b650c2',
'fc-secure-ccca1171-d3ee-42b3-8df8-aca336279cf3',
'fc-2d61b7df-571f-4201-a674-1107c84711df',
'fc-5d7cf59f-e361-4073-a6ad-16d8d78cc613',
'fc-secure-70487b95-e89c-45ec-ad0a-e5382d625c33',
'fc-2b68ae78-57af-4c65-8020-6f5ed4ae9408',
'fc-secure-33bdfbdb-de58-474e-8591-dad501aa1995',
'fc-2bcebe36-5d83-486a-947a-bbb5a606701d',
'fc-secure-ead0ff8d-eee9-4299-bb54-8404ffe9fa22',
'fc-secure-d8de1fe3-972d-480f-a8a8-2bbc251add30',
'fc-secure-4d47049d-9a31-435d-8c97-61cffce9a83b',
'fc-secure-31d85e96-7fa0-4c2e-a89a-fe5c70845fd7',
'fc-secure-68ff7cc9-274c-45d6-baad-75b9c5971a9c',
'fc-secure-fdaa7a52-520b-461b-a2d2-e31bf92e8e86',
'fc-8da94069-2edc-4e37-8c96-5a25740aeb32',
'fc-secure-9c348df7-4da1-428a-a785-e06db3a9f208',
'fc-secure-0de89e54-2149-4e06-81f9-da5af48c68a3',
'fc-secure-986229e0-ac72-420d-bf0e-aa14dea63a05',
'fc-18d7b17f-0ab3-4ab1-b730-61402c7ab4d0',
'fc-secure-4931149d-9e71-4865-9f41-3e4c998ffb38',
'fc-82bbaf50-f3d4-48e9-bd76-3874638fa714',
'fc-fd0450ff-f7c0-42ed-9254-cbbbdad1ed41',
'fc-secure-13597242-de35-44e2-b8fb-b5fa0b983501',
'fc-6ffb13ac-bced-48d9-9a61-b0803de3ba77',
'fc-a9e7890c-3902-4647-8b82-273490a7ce54',
'fc-secure-75f95e44-299f-4666-bed3-46dd679b12d8',
'fc-secure-240e1629-6d73-42ab-a373-1abeec17824c',
'fc-secure-94c90c12-376d-419a-96d9-ed37e1b1a5bb',
'fc-secure-6c21e787-1a4b-4235-b756-9ce6096fc815',
'fc-secure-51198b17-37ae-44b7-8513-c11c4bfe3a9d',
'fc-secure-59794551-d924-4ad7-905b-8727646d9aad',
'fc-secure-8ec82876-176f-4f33-ae98-0a3cae871ed4',
'fc-secure-8a282388-3c56-48c6-99c8-ea4b52c053b9',
'fc-secure-3e71d768-9da9-4845-9e2c-7e909db92cb7',
'fc-secure-0fc5a889-f57e-40a1-9859-c5b1e8a196d1',
'fc-secure-0f948ad2-2ae8-433c-9f0c-941c4c5e4a89',
'fc-secure-678eccb8-3463-4a72-8b57-69dfc8c77002',
'fc-secure-221b863c-a724-42f3-9f90-2081b352799c',
'fc-secure-21cd882f-8470-4c2e-93dc-536a908bae73',
'fc-secure-e92b8081-5e6a-440c-af83-4d428f505529',
'fc-secure-e0034430-99a3-4dde-99d3-a2330cd90f19',
'fc-secure-315a127f-649d-4928-b4e0-cdca7d898e05',
'fc-secure-550ffe2e-04fd-4763-b7d2-09f0c59083e4',
'fc-secure-84e57da6-4df9-45de-9f82-8a550887a7fa',
'fc-secure-2b4d5d05-d951-4e51-8ece-7e851660f91a',
'fc-secure-bfe6497b-69a1-4917-8a7b-c9bd36cb4ae4',
'fc-secure-3b588f92-0298-4ad6-b75d-fa16de8b718d',
'fc-secure-fe950bf8-0470-4329-b8c9-8a42d0dd619d',
'fc-secure-59d2af1f-3dc0-407b-b7ab-05cdcfa4da8f',
'fc-secure-01106611-a0e9-41bb-ac13-27683ab2fc19',
'fc-secure-7a160245-84eb-4383-80ed-f41c2411e702',
'fc-secure-32a2f8aa-4f72-43e9-9450-bbf661bde5ef',
'fc-secure-ce2baa61-748a-4dbc-a929-f256721b59b2',
'fc-secure-ac202043-c5ef-4fb7-8ccb-62a274c1b8ec',
'fc-secure-652024de-0ecd-4de3-8360-c8c5bfcafd72',
'fc-secure-7c845669-3781-4ac0-bb59-1495d68d1d85',
'fc-secure-330f768f-83c4-4570-ae46-0626b477d2b0',
'fc-secure-dccff364-c2ff-42df-8c8e-f979a0472c11',
'fc-secure-17d8dbc9-d1d8-4d5d-8eb7-c1b82bef24d8',
'fc-secure-674fbd89-9eeb-4e43-8a6f-97d6e50708e0',
'fc-secure-9f2f0267-2df4-44e9-a6ae-dd1d3a43cca5',
'fc-secure-6bc832d1-a35b-4676-bf68-a5772e2be044',
'fc-secure-e4b45d7c-3fee-479f-83e9-8c85312cb8da',
'fc-secure-516245cb-7dcc-487d-acf7-43e5fb10085f',
'fc-secure-ab235723-ed31-4242-b5ab-23c177a0e79c',
'fc-secure-b9906df4-3012-4c7b-a008-3c5708885971',
'fc-secure-05e511c4-0b47-41a5-a361-99f747cbef6c',
'fc-secure-adba6cb8-c49c-405b-af7b-9980e4a9d36a',
'fc-secure-98a7c433-bacc-44fd-96f6-faed04dd1c96',
'fc-secure-fd756575-ba39-4893-8b85-b6dfbb376f3b',
'fc-7dbe58d0-8579-48d7-abf8-215651316013',
'fc-secure-a473f80e-97a6-4c19-bd68-e37266efb44d',
'fc-secure-04e82709-08e0-4335-aaef-ba55089f6fd9',
'fc-secure-ed823158-2149-493f-80d0-ff066cb14a85',
'fc-secure-bcc5d428-aed0-4814-aefc-f717b97d5106',
'fc-secure-de72ef13-9b7f-44db-9428-5df489d327ce',
'fc-7fc7dc2b-f68e-4499-8384-3a3212113004',
'fc-secure-35c81df8-8bdf-467a-af6f-fb807185b82e',
'fc-secure-39458ab6-c2d3-49e2-b6d5-8bb3bae9a245',
'fc-secure-124c02b9-69b7-468c-b3d7-4a07aee74dc5',
'fc-secure-a065288d-5bb4-441c-95e9-0ffb20a6cf40',
'fc-secure-ee694ec4-cb3d-441d-95f7-e6d586419484',
'fc-secure-bd923846-0b8b-4018-8706-44b2a8e213b4',
'fc-secure-4c07a18a-8c79-4b81-acbe-91083298f1e4',
'fc-secure-538d85ea-c436-43f9-b001-4db614ed96bf',
'fc-99c362d9-0685-4532-9768-6bbc96bf3f16',
'fc-secure-d87970dd-adb0-4b99-a204-ae6fbd457d12',
'fc-secure-5f916770-fded-4540-b4b6-49f88b8e05fc',
'fc-secure-0aedc988-3736-496c-b7ac-20cca5b3ceb9',
'fc-secure-d4bead53-0db1-4e25-87da-c02be5819368',
'fc-secure-86cbdfa9-cbc0-40fb-adfa-3dd467ae1062',
'fc-secure-d157fd3c-57ff-4640-a084-cecda832e575',
'fc-secure-08bb70e6-9fa1-40dc-8822-41d73945c053',
'fc-secure-6a2f53f1-6712-48a9-a7b2-3289b8df877b',
'fc-secure-55225e12-ec4c-42e0-a5d1-986c87c6d129',
'fc-secure-89bba08d-ef3b-47bb-9c9b-a937d7550a97',
'fc-secure-bf34568b-1c38-4c43-8a21-59630b969553',
'fc-secure-8a297961-e042-4d02-826f-0322b3d7fbff',
'fc-secure-9befa92f-ef34-4fcf-8df5-d085656e26dd',
'fc-secure-870d27c3-a758-4535-b8dd-5fc0514c5215',
'fc-secure-980cd412-6b18-480a-b2f2-ad1543c06a91',
'fc-secure-16e0c63c-847a-42ef-91ca-3523b3668357',
'fc-secure-c53831f7-0431-44e5-abe6-308270690c3b',
'fc-secure-51a26e99-63eb-442a-869d-87ecbc60c814',
'fc-secure-e5676c90-7028-4b68-b620-c6944514d52c',
'fc-secure-977aa72f-e9ce-4fb6-b32b-c675b4ef25d5',
'fc-secure-d7a002ea-7e1e-45fd-8e76-456fce471f17',
'fc-secure-6537d7f6-f29f-432b-b66e-8cf2204b7920',
'fc-secure-b2669acd-7139-464f-af53-af7215c068aa',
'fc-secure-cb3eeabf-f0ef-497e-9bc6-b5a27be4fec2',
'fc-secure-2180b508-ce9d-4535-aa9f-f07d5917025c',
'fc-secure-7e0893cd-4f31-41e4-b1d2-3e656097824a',
'fc-secure-f8b9ce8d-efc0-4aa1-ad71-c0378d8d7194',
'fc-secure-b2f4e185-a21a-434a-9494-d1fabaaaf7c0',
'fc-secure-1355eb72-b00f-4796-8892-ac271b699503',
'fc-secure-68b7e62f-132b-4818-bf64-6c38ec9152ab',
'fc-f9f4111b-027e-4d27-923c-499f607cfa61',
'fc-secure-dae591de-00ad-478c-9440-88034a1b8cb9',
'fc-secure-7e9fe869-643a-4828-a1b7-0245e34745ae',
'fc-secure-228fd6fd-e0f7-4895-a246-3b055be27aa1',
'fc-secure-e99706c4-48f9-4a69-baf4-70d1c5eaac5c',
'fc-secure-d2c84e56-8f0d-420a-96a4-942e92009433',
'fc-secure-589e3f7a-7b24-46cf-aefd-63b05155d826',
'fc-secure-3fdbe020-6bdb-4668-bcb8-0d0df9d4ba8a',
'fc-secure-b31156cd-4993-4f69-a8f4-9a99c2697965',
'fc-secure-73036e74-c8b0-4e6f-9f4f-ca55b599d5d1',
'fc-secure-3c4843c0-b83f-4ba1-9bba-9c9a599f3ffb',
'fc-secure-91f9e579-b064-4992-8b00-c789ca48f861',
'fc-secure-ac588f86-da2d-4a92-9f45-be2aeedd5fac',
'fc-secure-2fa3df40-c189-41ee-b5ba-484a0b77ef77',
'fc-secure-00737009-4e0f-454d-bb02-4b70566a0ed2',
'fc-secure-36dfb67b-d2fc-47a1-a94c-225d72e08afd',
'fc-9197a911-c2f8-4f5f-91f9-389d191626d0',
'fc-secure-55efa443-810c-48c8-90bb-f07beba0e560',
'fc-secure-43207dac-0905-4fdd-b816-a34bd2ccebdd',
'fc-secure-382fb414-642b-45ef-9a85-c6c443da8691',
'fc-secure-fba19c6f-984e-4616-b253-6d9e6ea5cec5',
'fc-secure-1614d6d2-d053-4de0-9b97-cc4b0762f547',
'fc-41c77566-33d0-4523-a89e-fdad0b94f324',
'fc-secure-c40af798-8afc-4ab3-9b66-946955811d3b',
'fc-secure-abc7f058-0260-4e82-a911-abfec3dcb676',
'fc-secure-29cd113f-7eca-4526-aa52-dde1b8cb41d0',
'fc-secure-877e6c8c-72ef-46d0-b3f3-37dd175771fe',
'fc-secure-0eba3dae-89be-4642-8982-9a80a7428cd2',
'fc-secure-b41964ad-0c8a-47da-8504-f8636ff3d318',
'fc-secure-0ca0c5e6-26ca-47ea-b509-ec4eaa058fc6',
'fc-secure-bee7792c-ef35-478d-a9bb-c8f2054c335c',
'fc-secure-72a949c5-0b7d-45c9-96c3-ff4d25815ed5'
]

# Loop through buckets and record size and file count.
# Each bucket gets its own one-row table as soon as it finishes, and a combined table is shown at the end.
results = []
for bucket in bucket_list:
    start = time.time()
    file_count = 0
    size = 0
    try:
        storage_client = storage.Client()
        storage_bucket = storage_client.bucket(bucket, user_project="anvil-datastorage")
        # Stream the listing instead of materialising every blob in a list, so memory use
        # stays flat for very large buckets. Totals are only published once the whole
        # listing succeeds, so a failed bucket still reports 0 / 0 as before.
        running_count = 0
        running_size = 0
        for blob in storage_client.list_blobs(storage_bucket):
            running_count += 1
            running_size += blob.size
        file_count = running_count
        size = running_size
        status = "Success"
        fail_message = ""
    except Exception as e:
        status = "Failure"
        fail_message = f"; Fail Message: {str(e)}"
    end = time.time()
    duration = round(end-start,2)
    message = f"Duration: {duration}s{fail_message}"
    results.append([bucket, size, file_count, status, message])
    df_temp = pd.DataFrame([[bucket, size, file_count, status, message]], columns =["Bucket", "Size in Bytes", "Object Count", "Run Status", "Message"])
    display(df_temp)
print("---------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------")
df = pd.DataFrame(results, columns =["Bucket", "Size in Bytes", "Object Count", "Run Status", "Message"])
display(df)

In [None]:
# Re-display the combined bucket table (needs `results` from the bucket loop cell)
df = pd.DataFrame(
    data=results,
    columns=["Bucket", "Size in Bytes", "Object Count", "Run Status", "Message"],
)
display(df)

# Pulling MD5 Population Across AnVIL

## Pulling High Level MD5 Population Stats

In [None]:
# Purpose: for every TDR dataset on the given billing profile, count the rows in its
# BigQuery `file_inventory` table and how many of them have an md5_hash populated,
# then display one summary record per dataset.

# Define parameters
# Datasets whose default billing profile matches this ID are included in the scan
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Create the BigQuery client once and reuse it for every dataset
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details and pull source workspace(s)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except (KeyError, TypeError):
        # Properties are absent or do not carry a usable source_workspaces list
        source_workspace = ""

    # Pull total file count and MD5-populated file count from BigQuery.
    # The access-information lookups are inside the try so that one dataset
    # without BigQuery details yields an "Error" record instead of aborting the scan.
    try:
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
        md5_count_query = f"SELECT COUNT(*) AS file_count, COUNT(md5_hash) AS file_w_md5 FROM `{bq_project}.{bq_schema}.file_inventory`"
        df_output = client.query(md5_count_query).result().to_dataframe()
        file_count = df_output["file_count"].values[0]
        file_w_md5 = df_output["file_w_md5"].values[0]
        # COUNT(md5_hash) skips NULLs, so any difference means some files lack an MD5
        missing_md5 = bool(file_count != file_w_md5)
        status = "Success"
    except Exception as e:
        # Keep going, but surface the reason rather than silently swallowing it
        print(f"Error retrieving MD5 counts for dataset {dataset_entry.name} ({dataset_entry.id}): {e}")
        file_count = 0
        file_w_md5 = 0
        missing_md5 = False
        status = "Error"

    # Build record for dataset
    record = [dataset_entry.id, dataset_entry.name, source_workspace, file_count, file_w_md5, missing_md5, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Source Workspaces", "File Count", "MD5 Populated Count", "Missing MD5s", "Retrieval Status"])
df_sorted = df.sort_values(["Source Workspaces"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

Start time: 2023-03-13 12:44:46.685625


## Pulling Specific Problematic Files

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")

df_results = pd.DataFrame(columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id == billing_profile:
        # Retrieve dataset details and pull source workspace(s)
        dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        try:
            source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
        except:
            source_workspace = ""
        
        # Pull data file size sum from BigQuery
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
        client = bigquery.Client()
        file_size_query = """SELECT uri AS file_name, size_in_bytes AS file_size FROM `{project}.{schema}.file_inventory` WHERE md5_hash IS NULL""".format(project = bq_project, schema = bq_schema)
        try:
            output = client.query(file_size_query).result()
            if output.total_rows > 0:
                df_output = output.to_dataframe()
                df_output.rename(columns = {"file_name":"File Path", "file_size":"Byte Size"}, inplace = True)
                df_output["Dataset UUID"] = dataset_entry.id
                df_output["Dataset Name"] = dataset_entry.name
                df_output["Source Workspaces"] = source_workspace
                df_output["Retrieval Status"] = "Success - Files Found"
                df_results = df_results.append(df_output)
            else:
                output = [[dataset_entry.id, dataset_entry.name, source_workspace, None, 0, "Success - No Files Found"]]
                df_output = pd.DataFrame(output, columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
                df_results = df_results.append(df_output)
        except:
            output = [[dataset_entry.id, dataset_entry.name, source_workspace, None, 0, "Error"]]
            df_output = pd.DataFrame(output, columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
            df_results = df_results.append(df_output)
        
# Sort dataframe records and write out to file
df_sorted = df_results.sort_values(["Source Workspaces", "File Path"], ascending=[True, True], ignore_index=True)
output_file_path = "null_md5_files.tsv"
df_sorted.to_csv(output_file_path, index=False, sep="\t")
!gsutil cp $output_file_path $ws_bucket/ingest_pipeline/resources/ 2> stdout
!rm $output_file_path
print(f"End time: {datetime.datetime.now()}")
print(f"Results copied to: {ws_bucket}/ingest_pipeline/resources/{output_file_path}")