In [None]:
#!pip install --upgrade data_repo_client

In [None]:
# Imports
import import_ipynb
import ingest_pipeline_utilities as utils
import data_repo_client
from google.cloud import bigquery
from google.cloud import storage
import google.auth
import google.auth.transport.requests
import pandas as pd
import datetime
import os
import re
import time
import requests
# Terra workspace context, read from the notebook runtime's environment
# (a missing variable raises KeyError, in the order listed).
ws_name, ws_project, ws_bucket = (
    os.environ[_env_key]
    for _env_key in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# Bucket name with the leading "gs://" scheme stripped
ws_bucket_name = re.compile('^gs://').sub('', ws_bucket)

# Display options: show every row/column at full width when rendering frames
for _option, _setting in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ("display.max_colwidth", None),
    ('display.width', 1000),
    ('display.colheader_justify', 'center'),
    ('display.precision', 3),
):
    pd.set_option(_option, _setting)


# AnVIL Resource Access Control

## Bulk Add Users to Workspaces (and associated Auth Domains)

In [None]:
# Grant each listed user the listed role on every workspace in workspace_list
# (billing namespace "anvil-datastorage") and add the user to each of the
# workspace's authorization domain groups. One results table is displayed per user.
user_role_list = [
    #["user_email", "role - READER, WRITER, OWNER, NO ACCESS"]
     ["ncalvane@broadinstitute.org", "WRITER"]
]
workspace_list = [
'AnVIL_CCDG_Broad_MI_BRAVE_GRU_WES',
'AnVIL_HPRC',
]

for user_role in user_role_list:
    user = user_role[0]
    role = user_role[1]
    print(f"Processing ACL updates for user: {user}")
    results = []
    for workspace in workspace_list:

        # Initialize
        print(f"\tProcessing ACL updates for {workspace}.")
        error_list = []

        # Establish credentials (refreshed per workspace so the token stays valid on long lists)
        creds, project = google.auth.default()
        auth_req = google.auth.transport.requests.Request()
        creds.refresh(auth_req)
        auth_header = {"Authorization": f"Bearer {creds.token}"}

        # Add user to the workspace ACL with the requested role
        payload = [{
            "email": user,
            "accessLevel": role,
            "canShare": True,
            "canCompute": True
        }]
        response = requests.patch(
            url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{workspace}/acl",
            headers=auth_header,
            json=payload
        )
        if response.status_code != 200:
            error_list.append("Error adding to workspace ACL")

        # Pull workspace attributes and add the user to each auth domain.
        # The lookup lives inside the try so that an unreadable workspace (error
        # payload, non-JSON body, missing keys) is recorded for this workspace
        # instead of aborting the whole run.
        try:
            ws_attributes = requests.get(
                url=f"https://api.firecloud.org/api/workspaces/anvil-datastorage/{workspace}?fields=workspace.attributes,workspace.authorizationDomain,workspace.googleProject,workspace.bucketName",
                headers=auth_header
            ).json()
            for ad in ws_attributes["workspace"]["authorizationDomain"]:
                auth_domain = ad["membersGroupName"]
                response = requests.put(
                    url=f"https://api.firecloud.org/api/groups/{auth_domain}/member/{user}",
                    headers=auth_header
                )
                if response.status_code != 204:
                    error_list.append(f"Error adding to auth domain ({auth_domain})")
        except Exception:
            # Exception (not a bare except) so that a kernel interrupt still stops the loop.
            error_list.append("Error accessing workspace.")

        # Record status
        status = "Success" if not error_list else "Failure"
        error_str = "; ".join(error_list)
        results.append([workspace, status, error_str])

    # Display results
    print(f"\nResults for user: {user}")
    results_df = pd.DataFrame(results, columns = ["workspace", "status", "errors"])
    display(results_df)
    


# TDR Reader Management

## Remove Undesired Readers from TDR Datasets

In [None]:
# Function to remove erroneous readers from snapshot
def clean_up_ad_readers(snapshot_id, readers):
    """Remove every member of a snapshot's "reader" policy that is not allowed.

    Args:
        snapshot_id: ID of the TDR snapshot whose reader policy is cleaned up.
        readers: List of member emails to keep. If it contains the literal
            '$AUTH_DOMAIN', every entry of the "auth_domains" property of the
            snapshot's first source dataset (suffixed with "@firecloud.org")
            is kept as well. The list is never modified.

    Returns:
        None. Progress, the number of deletions and the remaining readers are
        printed.
    """
    print("Cleaning up readers for {}...".format(snapshot_id))
    # Work on a copy: appending to the caller's list would carry one snapshot's
    # auth domains over into the allowed readers of every later snapshot.
    reader_list = list(readers)
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    # Retrieve snapshot, grab auth_domain
    if '$AUTH_DOMAIN' in reader_list:
        snapshot_response = snapshots_api.retrieve_snapshot(id=snapshot_id)
        snapshot_name = snapshot_response.name
        print("Snapshot name: {}".format(snapshot_name))
        try:
            auth_domain_list = snapshot_response.source[0].dataset_properties["auth_domains"]
        except Exception:
            # No readable auth domain information: nothing extra is kept.
            auth_domain_list = []
        for ad in auth_domain_list:
            reader_list.append(ad + "@firecloud.org")

    # Retrieve snapshot policies and delete readers that aren't in reader list
    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id)
    max_attempts = 1  # attempts per member before giving up on it
    delete_count = 0
    failed_members = []
    for policy in snapshot_policy_response.policies:
        if policy.name != "reader":
            continue
        for policymember in policy.members:
            if policymember in reader_list:
                continue
            # Fresh client per deletion so the request goes out with a current client
            api_client = utils.refresh_tdr_api_client()
            snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
            for _attempt in range(max_attempts):
                try:
                    snapshots_api.delete_snapshot_policy_member(id=snapshot_id, policy_name="reader", member_email=policymember)
                    delete_count += 1
                    break
                except Exception:
                    continue
            else:
                # Every attempt failed; remember the member so it can be reported
                failed_members.append(policymember)

    # Print results
    snapshot_policy_response = snapshots_api.retrieve_snapshot_policies(id=snapshot_id)
    print(f"\t{delete_count} erroneous readers deleted.")
    if failed_members:
        print("\tCould not delete: {}".format(", ".join(failed_members)))

    for role in snapshot_policy_response.policies:
        if role.name == "reader":
            rem_readers = ", ".join(role.members)
            print(f"\tRemaining readers: {rem_readers}")
    return

# Clean-up snapshots: readers listed here are kept on each snapshot, everything
# else in the "reader" policy is removed (append '$AUTH_DOMAIN' to the list to
# also keep the snapshot's auth domain groups).
reader_list = ["azul-anvil-prod@firecloud.org"]
snapshot_id_list = ['b0fc6253-d274-4e53-9977-85d943116f7c']
for snapshot_id in snapshot_id_list:
    clean_up_ad_readers(snapshot_id, reader_list)


## Add Auth Domain Users to TDR Datasets

In [None]:
# Function to add readers to snapshot --> TO BE DONE WHEN ADs NEED TO BE ADDED BACK


# Collect AnVIL Snapshots and Datasets

In [None]:
# Build one table of AnVIL datasets and their snapshots: one row per snapshot,
# or a single row with empty snapshot fields for a dataset that has none.
api_client = utils.refresh_tdr_api_client()
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)
snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
datasets_list = datasets_api.enumerate_datasets(filter="anvil", limit=2000)
records_list = []
for dataset_entry in datasets_list.items:
    # Only keep datasets whose upper-cased name looks like ANVIL_<name>_<8 digits>
    if not re.match("^ANVIL_[a-zA-Z0-9-_]+_[0-9]{8}", dataset_entry.name.upper()):
        continue
    dataset_detail = datasets_api.retrieve_dataset(id=dataset_entry.id)
    snapshots_list = snapshots_api.enumerate_snapshots(dataset_ids=[dataset_entry.id], limit=1000)
    # Trailing four columns are identical for every row of this dataset
    dataset_fields = [
        dataset_entry.id,
        dataset_entry.name,
        dataset_detail.ingest_service_account,
        dataset_entry.created_date[0:10],
    ]
    if len(snapshots_list.items) > 0:
        for snapshot_entry in snapshots_list.items:
            record = [
                snapshot_entry.id,
                snapshot_entry.name,
                snapshot_entry.data_project,
                snapshot_entry.created_date[0:10],
                snapshot_entry.created_date,
            ] + dataset_fields
            records_list.append(record)
    else:
        record = [None, None, None, None, None] + dataset_fields
        records_list.append(record)
df = pd.DataFrame(
    records_list,
    columns=[
        "Snapshot ID",
        "Snapshot Name",
        "Snapshot Google Project",
        "Snapshot Created Date",
        "Snapshot Created Datetime",
        "Source Dataset ID",
        "Source Dataset Name",
        "Source Dataset SA",
        "Source Dataset Created Date",
    ],
)
df_sorted = df.sort_values(["Source Dataset Name", "Snapshot Name"], ascending=[True, True], ignore_index=True)
display(df_sorted)


# Snapshot Row Count Collection

In [None]:
def return_row_counts(snapshot_id, results_list):
    """Append the total row count across all tables of a snapshot to results_list.

    The snapshot's table names and BigQuery location are read from TDR, then a
    single BigQuery query counts the `datarepo_row_id` values of all tables
    combined.

    Args:
        snapshot_id: ID of the TDR snapshot to count rows for.
        results_list: List that receives one `[snapshot_id, row_count]` entry.

    Returns:
        The same `results_list` object, with one entry appended. The count is 0
        when the snapshot cannot be retrieved, has no tables, or the query
        fails; in the failure cases the reason is printed.
    """
    # Grab access information from schema
    api_client = utils.refresh_tdr_api_client()
    snapshots_api = data_repo_client.SnapshotsApi(api_client=api_client)
    try:
        response = snapshots_api.retrieve_snapshot(id=snapshot_id, include=["TABLES", "ACCESS_INFORMATION"]).to_dict()
        # De-duplicate table names; sorted only to make the generated query deterministic
        table_names = sorted({table_entry["name"] for table_entry in response["tables"]})
        bq_project = response["access_information"]["big_query"]["project_id"]
        bq_dataset = response["access_information"]["big_query"]["dataset_name"]
    except Exception as err:
        # Exception (not a bare except) so a kernel interrupt is not swallowed
        print("Could not retrieve snapshot {}: {}".format(snapshot_id, err))
        results_list.append([snapshot_id, 0])
        return results_list

    # Without tables the UNION below would be empty and the query invalid
    if not table_names:
        results_list.append([snapshot_id, 0])
        return results_list

    # Build row count query
    row_count_subquery = " UNION ALL ".join(
        "SELECT datarepo_row_id FROM `{project}.{dataset}.{table}`".format(project=bq_project, dataset=bq_dataset, table=table_name)
        for table_name in table_names
    )
    row_count_query = "SELECT COUNT(*) AS row_count FROM ({subquery})".format(subquery=row_count_subquery)

    # Execute query and write results to results list
    try:
        client = bigquery.Client()
        df_results = client.query(row_count_query).result().to_dataframe()
        row_count = df_results["row_count"].values[0]
        results_list.append([snapshot_id, row_count])
    except Exception as err:
        print("Row count query failed for snapshot {}: {}".format(snapshot_id, err))
        results_list.append([snapshot_id, 0])
    return results_list
    
# Collect one [snapshot_id, row_count] entry per snapshot listed below
snapshot_id_list = ['bb7eaad8-b02c-455c-964d-c9242019d9e5']
results_list = []
for snapshot_id in snapshot_id_list:
    # return_row_counts hands back the list it was given, with one entry appended
    results_list = return_row_counts(snapshot_id, results_list)

# Show the collected counts as a table
results_df = pd.DataFrame(data=results_list, columns=["snapshot_id", "row_count"])
display(results_df)


# Pulling Dataset Sizes Across AnVIL

## Pulling file counts and sizes from TDR

In [None]:
# Report file count and total file size (from each dataset's BigQuery
# `file_inventory` table) for every TDR dataset on the given billing profile.

# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# One BigQuery client is enough for all datasets
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details and pull source workspace(s)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except (KeyError, TypeError):
        # Properties missing/empty or without a usable source_workspaces list
        source_workspace = ""

    # Pull data file size sum from BigQuery. The access-information lookup is
    # inside the try so one dataset without it is reported as "Error" instead
    # of aborting the whole run.
    try:
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
        file_size_query = """SELECT COUNT(*) AS file_count, COALESCE(SUM(size_in_bytes),0) AS file_size FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
        df_output = client.query(file_size_query).result().to_dataframe()
        file_count = df_output["file_count"].values[0]
        byte_size = df_output["file_size"].values[0]
        status = "Success"
    except Exception:
        # Exception (not a bare except) so a kernel interrupt still stops the loop
        file_count = 0
        byte_size = 0
        status = "Error"

    # Build record for dataset
    record = [dataset_entry.id, dataset_entry.name, source_workspace, file_count, byte_size, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Source Workspaces", "File Count", "File Size (Bytes)", "Retrieval Status"])
# Render byte sizes as plain integer strings rather than in float/scientific notation
df["File Size (Bytes)"] = df["File Size (Bytes)"].astype(int).astype(str)
df_sorted = df.sort_values(["Source Workspaces"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

## Pulling file counts and sizes from WS Buckets

In [None]:
# List of buckets:
# Bucket names are given without the "gs://" prefix; the loop that follows
# reports object count and total size for each entry.
bucket_list = [
'fc-secure-e202b041-84ec-4ca9-80de-60ba7dda6178',
'fc-secure-21247341-3748-47a4-9dbe-42b8839446f9',
'fc-secure-8b2e7dfb-cf02-411b-8b17-9b15937794e7',
'fc-46bf051e-aec3-4adb-8178-3c51bc5e64ae',
'fc-89ed8c3d-3e5e-455b-952b-e4cdf21a08f9',
'fc-secure-b47581eb-772e-4bd3-9c30-614ad11ac955',
'fc-secure-a3a7e66a-6268-48a6-a001-fc3a6d2290d2',
'fc-97c4f159-5d6b-44ce-904c-8ccd16b28388',
'fc-secure-89d09af9-ce39-42b6-8b41-735cfe64561c',
'fc-secure-e9b536b4-a871-4a9d-8822-85658c7f8fc3',
'fc-secure-494df267-5bd4-4518-8ea0-6358420d9ecb',
'fc-secure-827d3809-7703-47db-b6fb-08a463569419',
'fc-secure-59794551-d924-4ad7-905b-8727646d9aad',
'fc-secure-3fe46cfd-c69a-4d77-aed5-2932e91abc8d',
'fc-secure-449c0e31-cb6a-478b-a0a0-e617d822f9fb',
'fc-secure-b66a9e2f-896e-4786-a601-4714fe1f78d7',
'fc-secure-86a7da7d-f63c-44ae-b25f-5dbec63209b4',
'fc-secure-e211f4ca-b14a-40d2-8408-8faa8cb1e81a',
'fc-secure-6df86ae7-d972-4019-961b-a4214fe7437a',
'fc-secure-9e3357c0-389c-41d7-94ee-56673db6b75f',
'fc-secure-22309b02-2059-4d9f-be94-9bdbed77ff24',
'fc-secure-408943c5-2694-4538-b729-73a61ac948f1',
'fc-secure-4a0f9fdc-6316-4e82-86a8-88598bdf6882',
'fc-secure-4ab74011-4864-4f81-83d8-825e378bb67b',
'fc-secure-dd8b31a3-b890-4b85-9edb-477daed4fa65',
'fc-secure-afd398cc-e3c9-4d65-94b3-28427b3a02d7',
'fc-secure-3bbde5a9-b898-41d6-8ce3-2c81ca53e34f',
'fc-secure-a563ccd1-2a2f-4ecf-8b87-83ad66b24ed1',
'fc-secure-e028b530-4f98-4e35-a698-b3a47ab4fc0e',
'fc-secure-862a13b2-7696-442e-aa14-0e2241f43c87',
'fc-secure-fde12f71-78e4-4512-9ee4-a98cdac6ea63',
'fc-secure-65e0696d-6c1c-46f5-b68c-7189acbb12e6',
'fc-secure-a183fc0d-9291-4fa4-86e4-10f66954dea6',
'fc-secure-3c0c744a-f0dd-4ffa-a4a8-43d2f9abe890',
'fc-secure-fa2670e5-f99c-4a21-8915-dfb3b2964ac8',
'fc-secure-2195f7f0-69eb-4d8f-b9c4-60ff45201f87',
'fc-secure-b8d20102-3579-464c-9b5b-d086c8076d34',
'fc-secure-b8e21bed-89ae-4aca-a4a1-a07fc2ea6f4e',
'fc-secure-f27a9d2e-f2a0-4eb3-8276-fe8a570ba591',
'fc-secure-e5065947-acd3-4407-9b96-6a5443b7f7f5',
'fc-secure-0914b286-b97e-43c6-a0c5-d9742258ec9b',
'fc-secure-fd35dec1-45c3-41ca-9156-fa16f004413e',
'fc-secure-ceb4e1fe-841d-4d06-8213-eec1397b0af7',
'fc-secure-2c8de1e3-2ea8-4ba9-a8bc-4e0b907305ec',
'fc-secure-7d249549-33e9-4b96-9448-31bb6b944c71',
'fc-secure-389d967f-63f5-4729-9cbe-52c14f071ac1',
'fc-secure-00819d85-1657-472b-95f7-0ccb934d310b',
'fc-secure-29a7c62b-bf87-4ef9-b1c7-b28fb178d400',
'fc-secure-e6b3315b-eefa-444a-87d5-fd86163a0d40',
'fc-secure-e0503432-75b9-4674-8e6d-2597dc529c4c',
'fc-secure-e442bbfd-4364-47f7-8fce-1c249347bf1a',
'fc-secure-f7f89b29-6590-4119-a259-08745d6b564a',
'fc-secure-b1713b51-6a84-419c-b979-5c9454e05561',
'fc-secure-a343ca65-169f-4a71-883c-b10aef1180f3',
'fc-secure-e3e31dd4-66e6-462a-a8ea-aefb73828777',
'fc-secure-180323ab-f749-4063-ae83-3bb93c739046',
'fc-secure-863474e6-0c25-4b97-a471-a6070227c7ab',
'fc-secure-d4bead53-0db1-4e25-87da-c02be5819368',
'fc-secure-89bba08d-ef3b-47bb-9c9b-a937d7550a97',
'fc-secure-870d27c3-a758-4535-b8dd-5fc0514c5215',
'fc-secure-a5618092-a9f4-447a-850f-21739f6c0c83',
'fc-secure-51788316-df65-4bc8-b6b3-e4a50f3de6cd',
'fc-secure-c3d677a2-778e-4459-ab03-ad08dd00f69e',
'fc-secure-c31dbfd1-8654-4c71-93ca-567439c75193',
'fc-secure-4d553a3c-b4fa-4ddf-9dfa-d945b8bf16b7',
'fc-secure-c794198e-e001-4695-8c56-130d202d4eec',
'fc-secure-deeb8626-424b-4a7e-b500-e15bcd245f2c',
'fc-secure-70409918-97e6-498c-a564-3d816da25184',
'fc-secure-be72b12d-7097-45ae-b624-20d1fbcb7d37',
'fc-secure-62493204-9a02-4282-8b22-fd69a25aa5e8',
'fc-secure-dd54449b-efc7-4a81-9e83-25e5efdbe4c5',
'fc-secure-8ccb6590-51a3-4d87-8503-3c5ca94d3443',
'fc-secure-5845816b-03bd-4ea7-9054-633abe95ba6b',
'fc-secure-c824996b-683d-430c-919b-a617ca092140',
'fc-secure-1ffeb3d9-85e3-4fbc-bf80-3e4785f1ef43',
'fc-secure-aecef290-3094-4070-9db2-91b23e3a2284',
'fc-secure-fbfd37de-ac4b-46e3-9cb0-b76f7789ca4a',
'fc-secure-ef198352-7a8f-40df-b351-d9c17f8b8bf2',
'fc-secure-9dd49799-299d-4686-a21c-9df0abfaddc5',
'fc-secure-f04b0ded-dcf2-4872-a447-b58f43d760a1',
'fc-secure-6058ce81-98fd-43fd-bb65-32d59e960370',
'fc-secure-1cb49bd3-c89b-4ebc-9d5c-7e2c941981a2',
'fc-secure-69af8f70-c0b3-4ffd-bdbe-cd18aecb1b3d',
'fc-secure-32713de1-b20c-4986-a916-618fd0dfed20',
'fc-02e8222a-9286-425f-a2c3-92d3b9786807',
'fc-db8d28e8-e27d-4c0c-8559-2ac15d4f82c9',
'fc-secure-cf7b22ec-6585-47e0-a96d-8037a4d3a670',
'fc-fe57063d-7dfd-4a77-a1a9-ce8007f796e7',
'fc-secure-11637185-0c16-4b4c-aec3-4926f280292c',
'fc-secure-5fa642ee-c48f-4b5e-8a0e-46d625d0b4ca',
'fc-secure-f19f92e2-92c0-4ae6-ac9e-26e09733dcd2',
'fc-secure-bae1021e-0789-41b7-94af-8461afe58856',
'fc-secure-e195dd68-4f18-405c-8955-efb6aa4a55b6',
'fc-secure-db8c2ae9-ef80-4516-8de2-61af9675b3e9',
'fc-secure-ded35a4b-965f-42be-a471-4bd1ea830170',
'fc-secure-3a8dd10b-8615-40dd-819b-66721cbeb3bc',
'fc-secure-aa1fee47-c5d1-4468-ac5f-eb2f11c97e9c',
]

# Loop through buckets and record size and file count.
# For each bucket one row is displayed as soon as it is done, and a combined
# table is displayed at the end. Requests are billed to "anvil-datastorage".
print(f"Start time: {datetime.datetime.now()}")
results = []
result_columns = ["Bucket", "Size in Bytes", "Object Count", "Run Status", "Message"]
for bucket in bucket_list:
    start = time.time()
    file_count = 0
    size = 0
    try:
        storage_client = storage.Client()
        storage_bucket = storage_client.bucket(bucket, user_project="anvil-datastorage")
        # Stream the listing and keep running totals instead of holding every
        # blob object of the bucket in memory at once.
        for blob in storage_client.list_blobs(storage_bucket):
            file_count += 1
            size += blob.size or 0  # a blob without a reported size counts as 0 bytes
        status = "Success"
        fail_message = ""
    except Exception as e:
        # Totals gathered before the failure are kept, flagged by the status
        status = "Failure"
        fail_message = f"; Fail Message: {str(e)}"
    end = time.time()
    duration = round(end-start,2)
    message = f"Duration: {duration}s{fail_message}"
    results.append([bucket, size, file_count, status, message])
    df_temp = pd.DataFrame([[bucket, size, file_count, status, message]], columns=result_columns)
    display(df_temp)
print("---------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------")
df = pd.DataFrame(results, columns=result_columns)
print(f"End time: {datetime.datetime.now()}")
display(df)

# Pulling MD5 Population Across AnVIL

## Pulling High Level MD5 Population Stats

In [None]:
# Report, for every TDR dataset on the given billing profile, how many rows of
# its BigQuery `file_inventory` table have an md5_hash and whether any lack one.

# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# One BigQuery client is enough for all datasets
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details and pull source workspace(s)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except (KeyError, TypeError):
        # Properties missing/empty or without a usable source_workspaces list
        source_workspace = ""

    # Pull file and MD5 counts from BigQuery. The access-information lookup is
    # inside the try so one dataset without it is reported as "Error" instead
    # of aborting the whole run.
    try:
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
        file_size_query = """SELECT COUNT(*) AS file_count, COUNT(md5_hash) AS file_w_md5 FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema)
        df_output = client.query(file_size_query).result().to_dataframe()
        file_count = df_output["file_count"].values[0]
        file_w_md5 = df_output["file_w_md5"].values[0]
        # COUNT(md5_hash) skips NULLs, so any difference means missing MD5s
        missing_md5 = bool(file_count != file_w_md5)
        status = "Success"
    except Exception:
        # Exception (not a bare except) so a kernel interrupt still stops the loop
        file_count = 0
        file_w_md5 = 0
        missing_md5 = False
        status = "Error"

    # Build record for dataset
    record = [dataset_entry.id, dataset_entry.name, source_workspace, file_count, file_w_md5, missing_md5, status]
    records_list.append(record)

# Read records into a dataframe
df = pd.DataFrame(records_list, columns =["Dataset UUID", "Dataset Name", "Source Workspaces", "File Count", "MD5 Populated Count", "Missing MD5s", "Retrieval Status"])
df_sorted = df.sort_values(["Source Workspaces"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
display(df_sorted)

## Pulling Specific Problematic Files

In [None]:
# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")

df_results = pd.DataFrame(columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
records_list = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id == billing_profile:
        # Retrieve dataset details and pull source workspace(s)
        dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
        try:
            source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
        except:
            source_workspace = ""
        
        # Pull data file size sum from BigQuery
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
        client = bigquery.Client()
        file_size_query = """SELECT uri AS file_name, size_in_bytes AS file_size FROM `{project}.{schema}.file_inventory` WHERE md5_hash IS NULL""".format(project = bq_project, schema = bq_schema)
        try:
            output = client.query(file_size_query).result()
            if output.total_rows > 0:
                df_output = output.to_dataframe()
                df_output.rename(columns = {"file_name":"File Path", "file_size":"Byte Size"}, inplace = True)
                df_output["Dataset UUID"] = dataset_entry.id
                df_output["Dataset Name"] = dataset_entry.name
                df_output["Source Workspaces"] = source_workspace
                df_output["Retrieval Status"] = "Success - Files Found"
                df_results = df_results.append(df_output)
            else:
                output = [[dataset_entry.id, dataset_entry.name, source_workspace, None, 0, "Success - No Files Found"]]
                df_output = pd.DataFrame(output, columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
                df_results = df_results.append(df_output)
        except:
            output = [[dataset_entry.id, dataset_entry.name, source_workspace, None, 0, "Error"]]
            df_output = pd.DataFrame(output, columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Path", "Byte Size", "Retrieval Status"])
            df_results = df_results.append(df_output)
        
# Sort dataframe records and write out to file
df_sorted = df_results.sort_values(["Source Workspaces", "File Path"], ascending=[True, True], ignore_index=True)
output_file_path = "null_md5_files.tsv"
df_sorted.to_csv(output_file_path, index=False, sep="\t")
!gsutil cp $output_file_path $ws_bucket/ingest_pipeline/resources/ 2> stdout
!rm $output_file_path
print(f"End time: {datetime.datetime.now()}")
print(f"Results copied to: {ws_bucket}/ingest_pipeline/resources/{output_file_path}")

# Pulling Specific Files Across AnVIL

In [None]:
# Count, for every TDR dataset on the given billing profile, the files in its
# BigQuery `file_inventory` table: total, those whose uri contains
# search_string, and distinct uris.

# Define parameters
billing_profile = "e0e03e48-5b96-45ec-baa4-8cc1ebf74c61"
search_string = "SubsetHailJointCall"

# Establish API client
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)
config = data_repo_client.Configuration()
config.host = "https://data.terra.bio"
config.access_token = creds.token
api_client = data_repo_client.ApiClient(configuration=config)
api_client.client_side_validation = False
datasets_api = data_repo_client.DatasetsApi(api_client=api_client)

# One BigQuery client is enough for all datasets
client = bigquery.Client()

# Loop through enumerated datasets and create records for those related to AnVIL
print(f"Start time: {datetime.datetime.now()}")

result_columns = ["Dataset UUID", "Dataset Name", "Source Workspaces", "File Count", "Match File Count", "Distinct File Count", "Retrieval Status"]
# Per-dataset frames are collected here and concatenated once at the end
# (DataFrame.append no longer exists in pandas 2.x and was quadratic anyway).
result_frames = []
datasets_list = datasets_api.enumerate_datasets(limit=2000)
for dataset_entry in datasets_list.items:
    if dataset_entry.default_profile_id != billing_profile:
        continue

    # Retrieve dataset details and pull source workspace(s)
    dataset_details = datasets_api.retrieve_dataset(id=dataset_entry.id, include=["ACCESS_INFORMATION", "PROPERTIES"]).to_dict()
    try:
        source_workspace = ",".join(dataset_details["properties"]["source_workspaces"])
    except (KeyError, TypeError):
        # Properties missing/empty or without a usable source_workspaces list
        source_workspace = ""

    # Pull file counts from BigQuery
    try:
        bq_project = dataset_details["access_information"]["big_query"]["project_id"]
        bq_schema = dataset_details["access_information"]["big_query"]["dataset_name"]
        file_size_query = """SELECT COUNT(*) AS file_count,  
                            COUNT(CASE WHEN uri like '%{search_string}%' THEN 1 END) AS match_file_count,
                            COUNT(DISTINCT uri) AS distinct_file_count 
                            FROM `{project}.{schema}.file_inventory`""".format(project = bq_project, schema = bq_schema, search_string = search_string)
        output = client.query(file_size_query).result()
        if output.total_rows > 0:
            df_output = output.to_dataframe().rename(columns = {"file_count":"File Count", "match_file_count":"Match File Count", "distinct_file_count":"Distinct File Count"})
            df_output["Dataset UUID"] = dataset_entry.id
            df_output["Dataset Name"] = dataset_entry.name
            df_output["Source Workspaces"] = source_workspace
            df_output["Retrieval Status"] = "Success - Files Found"
        else:
            # Defensive branch for an empty result set; the row has seven
            # values, so it must use the seven result columns.
            df_output = pd.DataFrame(
                [[dataset_entry.id, dataset_entry.name, source_workspace, 0, 0, 0, "Success - No Files Found"]],
                columns = result_columns,
            )
    except Exception:
        # Exception (not a bare except) so a kernel interrupt still stops the loop
        df_output = pd.DataFrame(
            [[dataset_entry.id, dataset_entry.name, source_workspace, 0, 0, 0, "Error"]],
            columns = result_columns,
        )
    result_frames.append(df_output)

# Combine per-dataset frames; pd.concat rejects an empty list, hence the guard.
# Selecting result_columns restores the intended column order.
if result_frames:
    df_results = pd.concat(result_frames, ignore_index=True)[result_columns]
else:
    df_results = pd.DataFrame(columns = result_columns)

# Sort dataframe records and display results
df_sorted = df_results.sort_values(["Source Workspaces"], ascending=[True], ignore_index=True)
print(f"End time: {datetime.datetime.now()}")
print("Results:")
display(df_sorted)
