# Backup Workspace
## Author: Jonn Smith
## Date: 2023/02/28
Detect all workspace data tables and notebooks, then back them up to a timestamped folder in the workspace bucket.
***

In [1]:
import os
import datetime
import gzip
import io

import pandas as pd
import firecloud.api as fapi
import numpy as np

from google.cloud import bigquery
from google.cloud import storage
from google.api_core.exceptions import NotFound

## Environment
Set up our environment (Terra namespace, workspace, and the location of the bucket(s)).

In [2]:
# Read the workspace coordinates from the environment; a missing variable
# raises KeyError right here instead of failing later in the notebook.
namespace, workspace, default_bucket = (
    os.environ['WORKSPACE_NAMESPACE'],
    os.environ['WORKSPACE_NAME'],
    os.environ['WORKSPACE_BUCKET'],
)

# Echo the resolved values so the run records which workspace was backed up.
print(
    f"Namespace: {namespace}",
    f"Workspace: {workspace}",
    f"Default Bucket: {default_bucket}",
    sep="\n",
)

Namespace: broad-firecloud-dsde-methods
Workspace: sr-malaria
Default Bucket: gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd


## Get and dump our entities

In [3]:
def load_table(namespace, workspace, table_name, store_membership=False):
    """Fetch every entity of one type from a workspace as an all-string DataFrame.

    Args:
        namespace: Workspace namespace passed through to `fapi.get_entities`.
        workspace: Workspace name passed through to `fapi.get_entities`.
        table_name: Entity type to fetch; also used to build the
            `entity:<table_name>_id` column name.
        store_membership: When True, the `samples` column is pulled out of the
            table and returned separately as a list of sets of entity names
            (one set per row).

    Returns:
        A `(table, membership)` tuple.  `table` is a DataFrame whose first
        column is `entity:<table_name>_id` and whose columns are all cast to
        `str`, or `None` when the workspace has no entities of this type.
        `membership` is the list described above, or `None` when it was not
        requested or there were no entities.
    """
    entities = fapi.get_entities(namespace, workspace, table_name).json()
    if len(entities) == 0:
        return None, None

    id_column = f"entity:{table_name}_id"

    # One row per entity, one column per attribute, plus the entity name as the ID.
    table = pd.DataFrame([entity['attributes'] for entity in entities])
    table[id_column] = [entity['name'] for entity in entities]

    membership = None
    if store_membership:
        membership = [
            {member['entityName'] for member in sample_refs['items']}
            for sample_refs in table['samples']
        ]
        table = table.drop(columns='samples')

    # Move the ID column to the front and stringify everything.
    other_columns = [column for column in table.columns if column != id_column]
    table = table[[id_column] + other_columns].astype(str)

    return table, membership

def fix_nans(df, quiet=True):
    """Return a copy of `df` with every `nan` value replaced by an empty string.

    `nan` values are caused by a parsing issue and are artifacts.  Two forms
    are handled: the literal string "nan" (what a missing value can turn into
    once a column has been cast to `str`) and real missing values (NaN/None).

    The input frame is left untouched; all replacements happen on the
    returned copy.

    Args:
        df: DataFrame to clean.
        quiet: When False, print a per-column count of the cells that will be
            blanked, plus progress messages.

    Returns:
        A new DataFrame with the same shape and labels as `df`.
    """
    if not quiet:
        print("Replacing all `nan` values with empty strings: ")
        # NaN never compares equal to anything (itself included), so real
        # missing values have to be counted with isna() rather than `==`.
        nan_counts = df.eq("nan").sum() + df.isna().sum()
        for column, count in nan_counts.items():
            if count > 0:
                print(f"\t{column}: {int(count)}")
        print("Replacing numpy nan values...")

    # The first replace blanks the "nan" strings, the second the real NaNs.
    cleaned = df.replace("nan", "").replace(np.nan, "")

    if not quiet:
        print("Done.")
    return cleaned

In [4]:
# Entity types present in the workspace; each one is dumped to its own table:
entity_types = fapi.list_entity_types(namespace, workspace).json()

# A single timestamp names the backup folder and every file written into it:
timestamp = f"{datetime.datetime.now():%Y%m%dT%H%M%S}"

# Look up the bucket name from the workspace record:
workspace_info = fapi.get_workspace(namespace, workspace).json()
workspace_bucket = workspace_info["workspace"]["bucketName"]
backup_folder_path = f"backups/{timestamp}"

In [5]:
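# NOTE: Superseded by the gzip-compressed backup in the next cell; kept commented out for reference only.
# # Create our timestamped backup bucket: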
# storage_client = storage.Client()
# bucket = storage_client.bucket(workspace_bucket)

# # Iterate over entity types and dump each one to a separate TSV:
# print(f"Writing workspace entities to backup dir:")
# print(f"gs://{workspace_bucket}/{backup_folder_path}")
# for et in entity_types:
#     print(f"\t{et}")
#     tbl, _ = load_table(namespace, workspace, et)
#     tbl = fix_nans(tbl)
#     table_name = f"{timestamp}_{namespace}_{workspace}_{et}.tsv"
    
#     # Write our table to our bucket:
#     blob = bucket.blob(f"{backup_folder_path}/tables/{table_name}")
#     with blob.open('w') as f:
#         tbl.to_csv(f, sep="\t", index=False)
# print('Done.')

In [6]:
# Get a handle to the workspace bucket.  No new bucket is created; backups are
# written under the timestamped folder inside it:
storage_client = storage.Client()
bucket = storage_client.bucket(workspace_bucket)

# Iterate over entity types and dump each one to a separate gzipped TSV:
print("Writing workspace entities to backup dir:")
print(f"gs://{workspace_bucket}/{backup_folder_path}")
for et in entity_types:
    print(f"\t{et}")
    tbl, _ = load_table(namespace, workspace, et)
    if tbl is None:
        # load_table returns None for an entity type with no entities, and
        # fix_nans cannot handle None, so there is nothing to write here.
        print("\t\tNo entities found; skipping.")
        continue
    tbl = fix_nans(tbl)
    table_name = f"{timestamp}_{namespace}_{workspace}_{et}.tsv.gz"

    # Serialize the table to TSV text in memory, then upload the compressed bytes:
    blob = bucket.blob(f"{backup_folder_path}/tables/{table_name}")
    tsv_text = tbl.to_csv(sep="\t", index=False)
    with blob.open('wb') as f:
        f.write(gzip.compress(tsv_text.encode('utf-8')))
print('Done.')

Writing workspace entities to backup dir:
gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/backups/20231218T153028
	sample_set_set
	z_external_pipeline_validation_set
	high_quality_assembly
	sample_set
	pfcrosses
	validation_set
	sample
	validation
	z_external_pipeline_validation
	remote_dataset
Done.


In [7]:
# Now backup the notebooks:
print("Writing notebooks to backup dir:")
print(f"gs://{workspace_bucket}/{backup_folder_path}")
for notebook_blob in storage_client.list_blobs(workspace_bucket, prefix='notebooks'):
    # Drop everything up to and including the first "/" to get the bare name:
    original_name = notebook_blob.name[notebook_blob.name.find("/")+1:]
    print(f"\t{original_name}")
    notebook_name = f"{timestamp}_{namespace}_{workspace}_{original_name}"
    # The copy source must be the notebook being iterated over.  Passing the
    # leftover `blob` variable from the table backup would copy a table file
    # under every notebook's name.
    bucket.copy_blob(
        notebook_blob,
        bucket,
        new_name=f"{backup_folder_path}/notebooks/{notebook_name}",
    )
print("Done.")

Writing notebooks to backup dir:
gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/backups/20231218T153028
	00_backup_workspace.ipynb
	01_import_new_flowcell_data_from_tsv.ipynb
	02_add_new_flowcells_to_sample_table.ipynb
	10_hail_pca_analysis.ipynb
	99_fix_sample_table_bam_entries.ipynb
	bknight.ipynb
	drug_resistance_heatmap.ipynb
	hail_playground.ipynb
	inspect_joint_call_cohort_with_IGV.ipynb
	inspect_sample_with_IGV.ipynb
	update_sample_table.ipynb
Done.


In [8]:
import pytz

# Use a timezone-aware UTC timestamp.  A naive datetime (what utcnow() returns)
# is interpreted by astimezone() as *local* time, which only gives the right
# Eastern time when the kernel's local zone happens to be UTC.
now_utc = datetime.datetime.now(datetime.timezone.utc)
timezone = pytz.timezone('America/New_York')
now_et = now_utc.astimezone(timezone)
time_string = now_et.strftime("%A %B %d at %H:%M:%S ET")
print(f"Backup completed on {time_string}")

Backup completed on Monday December 18 at 10:31:18 ET
