In [1]:
## imports and environment variables
# imports
from firecloud import api as fapi
import json
import os
import pandas as pd

# workspace identifiers are read from the notebook's environment variables;
# a missing variable raises KeyError
ws_name, ws_project, ws_bucket = (
    os.environ[env_key]
    for env_key in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)

# echo the resolved values, one per line, so the reader can confirm the target workspace
print(
    f"workspace name = {ws_name}",
    f"workspace project = {ws_project}",
    f"workspace bucket = {ws_bucket}",
    sep="\n",
)

workspace name = sushmac_sandbox_broad-firecloud-dsde
workspace project = broad-firecloud-dsde
workspace bucket = gs://fc-0ee5a557-571c-41eb-8679-a518582c48ac


In [2]:
# gets list of entity types in workspace
# TODO: ISSUE - cannot download the _set table tsv files with this FISS call. Downloads a zip.
# TODO: QUESTION - do we need set files from AnVIL workspaces since we cannot load them into TDR as sets?

# API call to get all entity types in workspace
response_etypes = fapi.list_entity_types(ws_project, ws_name)

# the response body is JSON; it is parsed into a dict whose top-level keys are the entity type names
dict_all_etypes = json.loads(response_etypes.text)

# iterating a dict yields its keys, so list() gives the entity type names directly
etypes_list = list(dict_all_etypes)

# print one entity type per line, indented with five tabs under the heading
print("List of entity types in current workspace:")
print('\n'.join(['\t' * 5 + c for c in etypes_list]))

List of entity types in current workspace:
					participant
					map_set
					column_order
					fastqs
					map
					new_table
					sample_set
					ubams
					check_cases_set
					check_cases
					ubams_set
					sample
					fastqs_set


In [3]:
# for each entity type, download tsv file to notebook PD

# initiate validation data list to capture entity counts in workspace vs counts in tsv
# want to compare that all rows successfully downloaded to tsvs
# items in list = [{"entity_type": "table_name", "data_model_count": #, "tsv_file_count": #},{...}]
validation_data = []

for etype in etypes_list:
    print(f'Starting download of tsv file for {etype}.')
    
    # get entity table response for API call
    res_etype = fapi.get_entities_tsv(ws_project, ws_name, etype, model="flexible")
    
    # Save current/original data model tsv files to the bucket for provenance
    destination_dir = "original_workspace_loadfiles"
    print(f'Saving original {etype} TSV to {ws_bucket}/{destination_dir}')
    original_tsv_name = "original_" + etype + "_table.tsv"
    with open(original_tsv_name, "w") as f:
        f.write(res_etype.text)
    
    # get number of rows in downloaded tsv file for given entity and update validation dict with count
    num_tsv_entities = !tail -n +2 $original_tsv_name | wc -l
    
    # capture counts of given entity into dictionary
    validation_dict = {}
    validation_dict["entity_type"] = etype
    validation_dict["tsv_file_count"] = num_tsv_entities[0]
    validation_dict["data_model_count"] = dict_all_etypes[etype]["count"]
    
    validation_data.append(validation_dict)

    # copy files to workspace bucket
    !gsutil cp $original_tsv_name $ws_bucket/$destionation_dir/ 2> stdout
    

Starting download of tsv file for participant.
Saving original participant TSV to gs://fc-0ee5a557-571c-41eb-8679-a518582c48ac/original_workspace_loadfiles
Starting download of tsv file for map_set.
Saving original map_set TSV to gs://fc-0ee5a557-571c-41eb-8679-a518582c48ac/original_workspace_loadfiles
Starting download of tsv file for column_order.
Saving original column_order TSV to gs://fc-0ee5a557-571c-41eb-8679-a518582c48ac/original_workspace_loadfiles
Starting download of tsv file for fastqs.
Saving original fastqs TSV to gs://fc-0ee5a557-571c-41eb-8679-a518582c48ac/original_workspace_loadfiles
Starting download of tsv file for map.
Saving original map TSV to gs://fc-0ee5a557-571c-41eb-8679-a518582c48ac/original_workspace_loadfiles
Starting download of tsv file for new_table.
Saving original new_table TSV to gs://fc-0ee5a557-571c-41eb-8679-a518582c48ac/original_workspace_loadfiles
Starting download of tsv file for sample_set.
Saving original sample_set TSV to gs://fc-0ee5a557-571

In [4]:
## show the per-entity count comparison as a table for visual inspection

# counts for the _set tables are expected to differ until the set file download is worked out
# TODO: only list rows where the numbers don't match or highlight rows where numbers don't match up
validation_df = pd.DataFrame(data=validation_data)

# bare last expression so the notebook renders the frame as the cell output
validation_df

Unnamed: 0,entity_type,tsv_file_count,data_model_count
0,participant,4,4
1,map_set,0,4
2,column_order,1,1
3,fastqs,2,2
4,map,7,7
5,new_table,2,2
6,sample_set,0,3
7,ubams,1,1
8,check_cases_set,1,2
9,check_cases,4,6


In [None]:
# delete copy of tsv files from notebook env - they will persist in designated workspace bucket directory
!rm *original*.tsv

In [5]:
# list the contents of the notebook's current working directory
!ls

 anvil-tdr-ingest.ipynb
 BETA_Remove_Workflow_Intermediates_v2.ipynb
 dalmatian-read-entities.ipynb
 DAP-paged_tables.ipynb
 fix_data_paths_no_imports_sushmac.ipynb
'Intro to FISS API in Python.ipynb'
'Intro to Fiss API in R.ipynb'
 Intro-to-Jupyter-Notebooks-test.ipynb
 janis_hello_world.ipynb
'notebooks_Variant QC and GWAS.ipynb'
 original_check_cases_set_table.tsv
 original_check_cases_table.tsv
 original_column_order_table.tsv
 original_fastqs_set_table.tsv
 original_fastqs_table.tsv
 original_map_set_table.tsv
 original_map_table.tsv
 original_new_table_table.tsv
 original_participant_table.tsv
 original_sample_set_table.tsv
 original_sample_table.tsv
 original_ubams_set_table.tsv
 original_ubams_table.tsv
 read_in_set_tables.ipynb
 Remove_Workflow_intermediates.ipynb
 resolve_drs_uris.ipynb
 R-read-data-from-bucket.ipynb
 stdout
 test3_181126_py3.ipynb
 testing_analyses_tab.ipynb
