In [None]:
# Version History
# Changelog convention: only the newest entry is an active print(); older entries stay
# commented out so the notebook reports a single current version when this cell runs.
# NOTE(review): there is no 1.0.4 entry and 1.0.3/1.0.5 carry the same timestamp - confirm.
#print("Version 1.0.0: 09/08/2022 7:48pm - Nate Calvanese - First version created")
#print("Version 1.0.1: 09/14/2022 3:55pm - Nate Calvanese - Added workspace_attribute table creation")
#print("Version 1.0.2: 09/16/2022 8:23am - Nate Calvanese - Prefixed source tables")
#print("Version 1.0.3: 09/23/2022 11:53am - Nate Calvanese - Made source workspace configurable")
#print("Version 1.0.5: 09/23/2022 11:53am - Nate Calvanese - Appended source workspace to file names")
#print("Version 1.0.6: 10/04/2022 3:35pm - Nate Calvanese - Added support for set tables")
#print("Version 1.0.7: 10/07/2022 4:07pm - Nate Calvanese - Switched to parsing entity model into TSV")
print(
    "Version 1.0.8: 10/12/2022 3:50pm - Nate Calvanese - Reverted source table prefixing"
)


In [None]:
#!pip install --upgrade import_ipynb data_repo_client

In [None]:
## Imports and environment variables

# Imports
import json
import logging
import os
import re
import subprocess

# import_ipynb is kept ahead of ingest_pipeline_utilities, which is presumably
# loaded from a notebook through the import hook it installs - confirm.
import import_ipynb
from firecloud import api as fapi
import pandas as pd

import ingest_pipeline_utilities as utils

# Set up root logging: INFO and above, with a 12-hour timestamp ahead of each message
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %I:%M:%S %p",
    format="%(asctime)s - %(levelname)s: %(message)s",
)

# Workspace name, namespace and bucket are read from the environment;
# a missing variable raises KeyError
ws_name, ws_project, ws_bucket = (
    os.environ[env_var]
    for env_var in ("WORKSPACE_NAME", "WORKSPACE_NAMESPACE", "WORKSPACE_BUCKET")
)
# Bucket name is the bucket URI with its leading gs:// scheme removed
ws_bucket_name = re.sub('^gs://', '', ws_bucket)

# print(f"workspace name = {ws_name}")
# print(f"workspace project = {ws_project}")
# print(f"workspace bucket = {ws_bucket}")
# print(f"workspace bucket name = {ws_bucket_name}")

In [None]:
# Helper: flatten a single entity record into one TSV row
def _entity_to_row(record):
    """Convert one entity record from the workspace data model into a flat dict.

    The first key is ``<entityType>_id`` holding the entity name; the remaining
    keys are the record's attributes in alphabetical order. Attribute values are
    flattened as follows:

    * non-dict values are kept as-is;
    * a dict with a truthy ``entityName`` (entity reference) becomes that name;
    * a dict with a truthy ``items`` list becomes a bracketed, comma-separated
      string. String members and referenced entity names are wrapped in double
      quotes; other scalar members (numbers, booleans, null) are written as JSON
      literals. Dict members without an ``entityName`` are omitted;
    * any other dict (including one with an empty ``items`` list) produces no
      column for this record.

    A failure while flattening one attribute is logged and only that attribute
    is skipped, so the rest of the record is still exported.

    Raises:
        KeyError: if the record has no ``entityType`` or ``name``.
    """
    pk_col_name = record["entityType"] + "_id"
    row = {pk_col_name: record["name"]}

    attributes = record.get("attributes")
    if not isinstance(attributes, dict):
        # A record without usable attributes still yields a row with its primary key
        attributes = {}

    flattened = {}
    for col_name, val in attributes.items():
        try:
            if not isinstance(val, dict):
                flattened[col_name] = val
            elif val.get("entityName"):
                flattened[col_name] = val["entityName"]
            elif val.get("items"):
                members = []
                for entry in val["items"]:
                    if isinstance(entry, dict):
                        if entry.get("entityName"):
                            members.append('"' + entry["entityName"] + '"')
                    elif isinstance(entry, str):
                        members.append('"' + entry + '"')
                    else:
                        # Non-string list members cannot be concatenated with quotes;
                        # emit them as JSON literals instead of losing the record's columns
                        members.append(json.dumps(entry))
                flattened[col_name] = "[" + ",".join(members) + "]"
        except Exception as exc:
            logging.warning(
                f"Skipping attribute {col_name} of {record['entityType']} {record['name']}: {exc}"
            )

    # Alphabetical attribute order; an attribute sharing the pk column name never overrides the pk
    for col_name in sorted(flattened):
        row.setdefault(col_name, flattened[col_name])
    return row


# Helper: copy one local file into the workspace bucket
def _copy_to_bucket(local_path, destination_uri):
    """Copy a local file to ``destination_uri`` with ``gsutil cp``; return True on success.

    The command is run from an argument list rather than through a shell, so
    file names containing spaces or shell metacharacters are passed through
    unchanged. gsutil's output is captured and logged when the copy fails.
    """
    try:
        result = subprocess.run(
            ["gsutil", "cp", local_path, destination_uri],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
    except OSError as exc:
        logging.error(f"Unable to run gsutil to copy {local_path}: {exc}")
        return False
    if result.returncode != 0:
        logging.error(
            f"gsutil cp of {local_path} to {destination_uri} failed: {result.stderr.strip()}"
        )
        return False
    return True


# Main source connector function
def create_source_table_data_files(params):
    """Export a workspace's entity tables and attributes as TSV files in the workspace bucket.

    For every entity type in the source workspace, the entities are fetched,
    flattened into rows, written to ``<etype>_<src_ws_name>.tsv`` and copied to
    ``<ws_bucket>/<input_dir>/<etype>/``. The workspace attributes are written
    the same way under the ``workspace_attributes`` directory as two columns,
    ``attribute`` and ``value``. Local TSV files written by this call are
    removed afterwards.

    Args:
        params: dict with the keys
            ``input_dir`` - destination directory inside the workspace bucket,
            ``src_ws_project`` - namespace of the source workspace,
            ``src_ws_name`` - name of the source workspace.

    Returns:
        Tuple ``(log_status, log_string)``. ``log_status`` is ``"Success"``, or
        ``"Warning"`` when a table's row count differs from the count reported
        by the data model or a bucket copy fails. ``log_string`` is the JSON
        dump of the per-table validation records.

    Raises:
        RuntimeError: if listing entity types or fetching entities does not
            return HTTP 200.
    """
    # Initialize parameters
    log_status = "Success"
    validation_data = []
    written_files = []
    destination_dir = params["input_dir"]
    src_ws_project = params["src_ws_project"]
    src_ws_name = params["src_ws_name"]

    # Collect and record all entity types in workspace
    response_etypes = fapi.list_entity_types(src_ws_project, src_ws_name)
    if response_etypes.status_code != 200:
        # Fail early rather than treating the keys of an error payload as entity types
        raise RuntimeError(
            f"Unable to list entity types for {src_ws_project}/{src_ws_name}: {response_etypes.text}"
        )
    dict_all_etypes = json.loads(response_etypes.text)
    etypes_list = list(dict_all_etypes)
    logging.info("List of entity tables in current workspace: " + ', '.join(etypes_list))

    # Loop through the entity types, pull JSON structure, convert to tsv, validate record counts, and transfer to cloud
    for etype in etypes_list:
        logging.info(f'Starting download of tsv file for {etype} table.')

        # Get entity table information
        res_etype = fapi.get_entities(src_ws_project, src_ws_name, etype)
        if res_etype.status_code != 200:
            raise RuntimeError(
                f"Unable to retrieve {etype} entities for {src_ws_project}/{src_ws_name}: {res_etype.text}"
            )
        original_tsv_name = etype + "_" + src_ws_name + ".tsv"

        # Build tsv records from JSON entity model
        record_list = [_entity_to_row(record) for record in json.loads(res_etype.text)]
        df = pd.DataFrame.from_records(record_list)
        df.to_csv(original_tsv_name, index=False, sep="\t")
        written_files.append(original_tsv_name)

        # Count exported rows from the frame itself: counting physical lines in the
        # file over-counts whenever a field contains an embedded newline
        num_tsv_entities = len(df)

        # Capture entity record counts into a validation dictionary
        # (tsv_file_count stays a string, as it was when read from shell output)
        validation_dict = {
            "entity_type": etype,
            "tsv_file_count": str(num_tsv_entities),
            "data_model_count": dict_all_etypes[etype]["count"],
        }
        if num_tsv_entities == int(dict_all_etypes[etype]["count"]):
            validation_dict["record_count_validation"] = "Passed"
        else:
            validation_dict["record_count_validation"] = "Failed"
            log_status = "Warning"
        validation_data.append(validation_dict)

        # Copy tsv file to workspace bucket
        logging.info(f'Copying {original_tsv_name} to {ws_bucket}/{destination_dir}/{etype}')
        if not _copy_to_bucket(original_tsv_name, f"{ws_bucket}/{destination_dir}/{etype}/"):
            log_status = "Warning"

    # Grab workspace attributes, flatten, convert to tsv, and transfer to cloud
    logging.info('Starting download of workspace attribute information.')
    etype = "workspace_attributes"
    ws_attributes_file = etype + "_" + src_ws_name + ".tsv"
    raw_ws_attributes = utils.get_workspace_attributes(src_ws_project, src_ws_name)
    ws_rows = []
    for key, val in raw_ws_attributes["attributes"].items():
        if isinstance(val, list):
            # str() each member so lists of numbers/booleans can be joined too
            val_str = ", ".join(str(item) for item in val)
        else:
            val_str = str(val)
        # Tabs are stripped so a value cannot break the tab-separated layout
        ws_rows.append((key, val_str.replace("\t", "")))
    # Naming the columns up front keeps this working when the workspace has no attributes
    ws_df = pd.DataFrame(ws_rows, columns=["attribute", "value"])
    ws_df.to_csv(ws_attributes_file, sep="\t", index=False)
    written_files.append(ws_attributes_file)
    logging.info(f"Copying {ws_attributes_file} to {ws_bucket}/{destination_dir}/{etype}")
    if not _copy_to_bucket(ws_attributes_file, f"{ws_bucket}/{destination_dir}/{etype}/"):
        log_status = "Warning"

    # Dump validation data to log string to return to caller and display results
    log_string = json.dumps(validation_data)
    validation_df = pd.DataFrame(validation_data)
    logging.info("Download and copy of tsv files complete. Validation results: \n")
    display(validation_df)

    # Delete only the tsv files written by this call from the notebook env - they will
    # persist in the designated workspace bucket directory; other *.tsv files are left alone
    for tsv_path in written_files:
        try:
            os.remove(tsv_path)
        except OSError as exc:
            logging.warning(f"Unable to remove local file {tsv_path}: {exc}")

    # Return log variables
    return log_status, log_string

In [None]:
# Test
# params = {}
# params["input_dir"] = "ingest_pipeline/input/test/table_data"
# params["src_ws_project"] = "anvil-datastorage"
# params["src_ws_name"] = "AnVIL_ccdg_asc_ndd_daly_talkowski_ac-boston_asd_exome"
# log_status, log_string = create_source_table_data_files(params)

In [None]:
# Test 2
# params = {}
# params["input_dir"] = "ingest_pipeline/input/test2/table_data"
# params["src_ws_project"] = "gro-share-seq-computational"
# params["src_ws_name"] = "GRO_share_seq_computational_testing"
# log_status, log_string = create_source_table_data_files(params)