#### Section 1: set up "global" imports and environment variables

In [1]:
## imports and environment variables
# imports
from firecloud import api as fapi
import json
import os
import pandas as pd

# Workspace identifiers, read from the process environment.
# Each lookup raises KeyError if its variable is not set.
ws_name = os.environ["WORKSPACE_NAME"]
ws_project = os.environ["WORKSPACE_NAMESPACE"]
ws_bucket = os.environ["WORKSPACE_BUCKET"]

# Echo the resolved values, one per line, so the target workspace is visible in the output.
print(
    f"workspace name = {ws_name}",
    f"workspace project = {ws_project}",
    f"workspace bucket = {ws_bucket}",
    sep="\n",
)


workspace name = tdr-anvil-ingest-bjt
workspace project = dsp-data-ingest
workspace bucket = gs://fc-secure-e7856519-5bea-4fec-88ec-dad61673d22f


#### Section 2: get source data files and complete basic validation

In [2]:
def create_source_files():
    # gets list of entity types in workspace

    # API call to get all entity types in workspace
    response_etypes = fapi.list_entity_types(ws_project, ws_name)
    dict_all_etypes = json.loads(response_etypes.text)

    etypes_list = []
    etypes_list = [key for key in dict_all_etypes.keys()]

    print(f"List of entity types in current workspace:")
    print('\n'.join(['\t' * 5 + c for c in etypes_list]))
    
    # for each entity type, download tsv file to notebook PD

    # initiate validation data list to capture entity counts in workspace vs counts in tsv
    # want to compare that all rows successfully downloaded to tsvs
    # items in list = [{"entity_type": "table_name", "data_model_count": #, "tsv_file_count": #},{...}]
    validation_data = []

    for etype in etypes_list:
        print(f'Starting download of tsv file for {etype}.')

        # get entity table response for API call
        res_etype = fapi.get_entities_tsv(ws_project, ws_name, etype, model="flexible")

        # Save current/original data model tsv files to the bucket for provenance
        destination_dir = "ingest_pipeline/input/metadata"
        print(f'Saving original {etype} TSV to {ws_bucket}/{destination_dir}')
        original_tsv_name = etype + ".tsv"
        with open(original_tsv_name, "w") as f:
            f.write(res_etype.text)

        # get number of rows in downloaded tsv file for given entity and update validation dict with count
        num_tsv_entities = !tail -n +2 $original_tsv_name | wc -l

        # capture counts of given entity into dictionary
        validation_dict = {}
        validation_dict["entity_type"] = etype
        validation_dict["tsv_file_count"] = num_tsv_entities[0]
        validation_dict["data_model_count"] = dict_all_etypes[etype]["count"]

        validation_data.append(validation_dict)

        # copy files to workspace bucket
        !gsutil cp $original_tsv_name $ws_bucket/$destination_dir/ 2> stdout
        
        ## print validation dataframe for visual inspection

    # set values differ because we have to determine how to get set files downloaded
    # TODO: only list rows where the numbers don't match or highlight rows where numbers don't match up
    validation_df = pd.DataFrame(validation_data)
    print(f"source files validation metrics: \n")
    print(validation_df)

    # write validation df to file and copy file to destination_dir
    validation_metrics_filename = f"{ws_name}_src_file_validation_metrics.csv"
    validation_df.to_csv(validation_metrics_filename, sep='\t')

    !gsutil cp $validation_metrics_filename $ws_bucket/$destination_dir/

    print(f"\n validation metrics: {ws_bucket}/{destination_dir}/{validation_metrics_filename}")
    
    # delete copy of tsv files from notebook env - they will persist in designated workspace bucket directory
    !rm *.tsv
    

#### Section 3: get workspace metadata and dataset attributes

In [3]:
# get workspace attributes and phs ID functions
def get_workspace_attributes(ws_project, ws_name):
    """Get workspace attributes, write to json, copy json to workspace bucket."""
    
    ws_attributes = fapi.get_workspace(ws_project, ws_name, fields="workspace.attributes \n").json()
    
    destination_dir = "ingest_pipeline/input/metadata"

    # write json to file and save in directory with metadata
    ws_attrs_filename = f"{ws_name}_workspace_attributes.json"
    with open(ws_attrs_filename, 'w') as json_outfile:
        json.dump(ws_attributes, json_outfile)
        # copy json file to bucket
        !gsutil cp $ws_attrs_filename $ws_bucket/$destination_dir/ 2> stdout
        print(f"workspace attributes: {ws_bucket}/{destination_dir}/{ws_attrs_filename}")
    
    return ws_attributes

def get_workspace_phs_id(ws_attributes):
    """Get workspace's phs ID from workspace attributes.

    Reads the tag list at ``ws_attributes["workspace"]["attributes"]["tag:tags"]["items"]``,
    interprets every tag of the form ``"key: value"`` as a key/value pair, and
    returns the value stored under the ``dbGaP`` key. Tags that do not contain
    the ``": "`` separator are ignored. If a key appears more than once, the
    last occurrence wins. The phs ID is also printed.

    Args:
        ws_attributes: nested dict containing the path described above.

    Returns:
        The text following ``"dbGaP: "`` in the matching tag.

    Raises:
        KeyError: if the tag path is missing from ``ws_attributes`` or no
            ``dbGaP`` tag is present.
    """
    # parse workspace attributes to get phsID from tags
    tags_list = ws_attributes["workspace"]["attributes"]["tag:tags"]["items"]

    tags_dict = {}
    for tag in tags_list:
        # partition() always yields three parts, so a tag without ": " no longer
        # raises ValueError on unpacking; such tags are skipped.
        key, separator, value = tag.partition(': ')
        if not separator:
            continue
        tags_dict[key] = value

    phs_id = tags_dict["dbGaP"]
    print(f"phs ID for workspace: {phs_id}")

    return phs_id
        